summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVitaly Buka <vitalybuka@google.com>2024-04-04 17:42:50 -0700
committerVitaly Buka <vitalybuka@google.com>2024-04-04 17:42:50 -0700
commit4021d3899046d5a27ffe48c607a19c7d4a86df14 (patch)
treee89f3b2540893ace6ab1c54767ef62c0f178c2ea
parent9877386e2390c42022a566f8fa6fd1f6df08c0ad (diff)
parenta9d93873f857963eeb9ef7f65a725e6aaf99c958 (diff)
Created using spr 1.3.4 [skip ci]
-rw-r--r--.github/workflows/libcxx-build-and-test.yaml28
-rw-r--r--clang-tools-extra/clangd/IncludeCleaner.h9
-rw-r--r--clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h50
-rw-r--r--clang/include/clang/Analysis/FlowSensitive/DataflowLattice.h8
-rw-r--r--clang/include/clang/Driver/Options.td2
-rw-r--r--clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp49
-rw-r--r--clang/lib/Basic/Targets/Mips.h8
-rw-r--r--clang/lib/CodeGen/BackendUtil.cpp5
-rw-r--r--clang/lib/CodeGen/CGBlocks.cpp2
-rw-r--r--clang/lib/CodeGen/CGExprComplex.cpp6
-rw-r--r--clang/lib/CodeGen/CGOpenMPRuntime.cpp6
-rw-r--r--clang/lib/Driver/ToolChains/Clang.cpp3
-rw-r--r--clang/lib/Headers/__stddef_unreachable.h4
-rw-r--r--clang/lib/Sema/SemaTemplate.cpp1
-rw-r--r--clang/test/CodeGen/cx-complex-range.c656
-rw-r--r--clang/test/CodeGen/remote-traps.c15
-rw-r--r--clang/test/CodeGen/tbaa-struct-bitfield-endianness.cpp39
-rw-r--r--clang/test/Driver/mcmodel.c1
-rw-r--r--clang/test/InstallAPI/driver-invalid-options.test6
-rw-r--r--clang/test/Interpreter/inline-asm.cpp17
-rw-r--r--clang/tools/clang-repl/ClangRepl.cpp1
-rw-r--r--clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp72
-rw-r--r--compiler-rt/lib/hwasan/hwasan_thread_list.h7
-rw-r--r--flang/.gitignore1
-rw-r--r--flang/include/flang/Common/windows-include.h25
-rw-r--r--flang/include/flang/Runtime/io-api.h164
-rw-r--r--flang/lib/Lower/ConvertCall.cpp9
-rw-r--r--flang/lib/Lower/OpenMP/ReductionProcessor.cpp31
-rw-r--r--flang/lib/Lower/Runtime.cpp9
-rw-r--r--flang/lib/Semantics/check-declarations.cpp5
-rw-r--r--flang/runtime/command.cpp4
-rw-r--r--flang/runtime/environment.cpp2
-rw-r--r--flang/runtime/environment.h2
-rw-r--r--flang/runtime/execute.cpp4
-rw-r--r--flang/runtime/file.cpp3
-rw-r--r--flang/runtime/freestanding-tools.h19
-rw-r--r--flang/runtime/io-api.cpp204
-rw-r--r--flang/runtime/io-error.cpp9
-rw-r--r--flang/runtime/io-error.h2
-rw-r--r--flang/runtime/lock.h4
-rw-r--r--flang/runtime/namelist.cpp46
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/i386-unknown-linux-gnu-as1
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/i386-unknown-linux-gnu-ld1
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/i386-unknown-linux-gnu-ld.bfd1
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/i386-unknown-linux-gnu-ld.gold1
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/x86_64-unknown-linux-gnu-as1
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/x86_64-unknown-linux-gnu-ld1
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/x86_64-unknown-linux-gnu-ld.bfd1
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/x86_64-unknown-linux-gnu-ld.gold1
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/bin/as1
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/bin/ld1
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/bin/ld.bfd1
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/bin/ld.gold1
-rw-r--r--flang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/lib/.keep0
-rw-r--r--flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0/crtbegin.o0
-rw-r--r--flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/crtbegin.o0
-rw-r--r--flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/crtbeginT.o0
-rw-r--r--flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/crtfastmath.o0
-rw-r--r--flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/x32/crtbegin.o0
-rw-r--r--flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/x32/crtbeginT.o0
-rw-r--r--flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/x32/crtfastmath.o0
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/as1
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/ld1
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/ld.bfd1
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/ld.gold1
-rwxr-xr-xflang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/ld.lld0
-rw-r--r--flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/lib/.keep0
-rw-r--r--flang/test/Driver/driver-help-hidden.f903
-rw-r--r--flang/test/Driver/driver-help.f903
-rw-r--r--flang/test/Driver/gcc-toolchain-install-dir.f9021
-rw-r--r--flang/test/Lower/HLFIR/assumed-rank-iface.f9023
-rw-r--r--flang/test/Lower/OpenMP/parallel-reduction-array.f902
-rw-r--r--flang/test/Lower/OpenMP/parallel-reduction-array2.f902
-rw-r--r--flang/test/Lower/OpenMP/parallel-reduction3.f90125
-rw-r--r--flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f9090
-rw-r--r--flang/test/Lower/OpenMP/wsloop-reduction-array.f902
-rw-r--r--flang/test/Lower/OpenMP/wsloop-reduction-array2.f902
-rw-r--r--flang/test/Lower/stop-statement.f904
-rw-r--r--flang/test/Semantics/cuf03.cuf3
-rw-r--r--flang/unittests/Runtime/Time.cpp35
-rw-r--r--libc/cmake/modules/LLVMLibCCompileOptionRules.cmake7
-rw-r--r--libc/config/baremetal/api.td5
-rw-r--r--libc/config/gpu/api.td6
-rw-r--r--libc/config/linux/api.td12
-rw-r--r--libc/config/linux/x86_64/entrypoints.txt1
-rw-r--r--libc/docs/math/index.rst2
-rw-r--r--libc/fuzzing/CMakeLists.txt1
-rw-r--r--libc/fuzzing/__support/CMakeLists.txt7
-rw-r--r--libc/fuzzing/__support/uint_fuzz.cpp70
-rw-r--r--libc/include/CMakeLists.txt10
-rw-r--r--libc/include/llvm-libc-macros/math-macros.h11
-rw-r--r--libc/spec/posix.td7
-rw-r--r--libc/spec/stdc.td2
-rw-r--r--libc/src/__support/FPUtil/dyadic_float.h6
-rw-r--r--libc/src/__support/RPC/rpc.h8
-rw-r--r--libc/src/__support/UInt.h1129
-rw-r--r--libc/src/__support/float_to_string.h7
-rw-r--r--libc/src/__support/integer_literals.h25
-rw-r--r--libc/src/__support/math_extras.h249
-rw-r--r--libc/src/__support/number_pair.h11
-rw-r--r--libc/src/math/CMakeLists.txt2
-rw-r--r--libc/src/math/exp2m1f.h18
-rw-r--r--libc/src/math/generic/CMakeLists.txt21
-rw-r--r--libc/src/math/generic/exp2m1f.cpp183
-rw-r--r--libc/src/stdio/fseeko.h1
-rw-r--r--libc/src/stdio/ftello.h1
-rw-r--r--libc/test/src/__support/integer_literals_test.cpp21
-rw-r--r--libc/test/src/__support/math_extras_test.cpp57
-rw-r--r--libc/test/src/__support/uint_test.cpp192
-rw-r--r--libc/test/src/math/CMakeLists.txt15
-rw-r--r--libc/test/src/math/exhaustive/CMakeLists.txt15
-rw-r--r--libc/test/src/math/exhaustive/exp2m1f_test.cpp33
-rw-r--r--libc/test/src/math/exp2m1f_test.cpp66
-rw-r--r--libc/test/src/math/smoke/CMakeLists.txt11
-rw-r--r--libc/test/src/math/smoke/exp2m1f_test.cpp63
-rw-r--r--libc/utils/MPFRWrapper/MPFRUtils.cpp50
-rw-r--r--libc/utils/MPFRWrapper/MPFRUtils.h1
-rw-r--r--libclc/CMakeLists.txt114
-rw-r--r--libcxx/docs/ReleaseNotes/19.rst3
-rw-r--r--libcxx/docs/Status/Cxx2cPapers.csv2
-rw-r--r--libcxx/docs/UsingLibcxx.rst5
-rw-r--r--libcxx/docs/index.rst2
-rw-r--r--libcxx/include/CMakeLists.txt2
-rw-r--r--libcxx/include/__algorithm/comp.h5
-rw-r--r--libcxx/include/__algorithm/equal.h7
-rw-r--r--libcxx/include/__algorithm/mismatch.h4
-rw-r--r--libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h8
-rw-r--r--libcxx/include/__chrono/tzdb.h35
-rw-r--r--libcxx/include/__chrono/tzdb_list.h10
-rw-r--r--libcxx/include/__functional/operations.h11
-rw-r--r--libcxx/include/__functional/ranges_operations.h5
-rw-r--r--libcxx/include/__numeric/pstl_transform_reduce.h2
-rw-r--r--libcxx/include/__type_traits/desugars_to.h (renamed from libcxx/include/__type_traits/operation_traits.h)9
-rw-r--r--libcxx/include/chrono5
-rw-r--r--libcxx/include/libcxx.imp2
-rw-r--r--libcxx/include/module.modulemap2
-rw-r--r--libcxx/include/strstream12
-rw-r--r--libcxx/modules/std/chrono.inc4
-rw-r--r--libcxx/modules/std/strstream.inc2
-rw-r--r--libcxx/src/tzdb.cpp59
-rw-r--r--libcxx/test/libcxx/atomics/diagnose_invalid_memory_order.verify.cpp2
-rw-r--r--libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp8
-rw-r--r--libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp8
-rw-r--r--libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/no_unique_address.compile.pass.cpp2
-rw-r--r--libcxx/test/libcxx/ranges/range.adaptors/range.split/no_unique_address.compile.pass.cpp2
-rw-r--r--libcxx/test/libcxx/ranges/range.factories/range.istream.view/no_unique_address.compile.pass.cpp2
-rw-r--r--libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp84
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp_size.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp_size.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/rdbuf.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/str.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.verify.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/types.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/cp_size_mode.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/default.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/freeze.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/pcount.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/rdbuf.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/str.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.verify.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/types.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/cp_size_mode.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/default.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.dest/rdbuf.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/freeze.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/pcount.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/str.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.verify.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstream/types.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ccp_size.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cp_size_cp.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cscp_size.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cucp_size.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/custom_alloc.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/scp_size_scp.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ucp_size_ucp.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/freeze.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/overflow.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/pcount.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/str.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/overflow.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/pbackfail.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekoff.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekpos.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/setbuf.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/underflow.pass.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.verify.cpp2
-rw-r--r--libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/types.pass.cpp2
-rw-r--r--libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp77
-rw-r--r--libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp62
-rw-r--r--libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp79
-rw-r--r--libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp64
-rwxr-xr-xlibcxx/utils/ci/oss-fuzz.sh2
-rw-r--r--lld/COFF/Chunks.cpp28
-rw-r--r--lld/COFF/Chunks.h6
-rw-r--r--lld/COFF/Driver.cpp14
-rw-r--r--lld/COFF/SymbolTable.cpp1
-rw-r--r--lld/ELF/Arch/AArch64.cpp4
-rw-r--r--lld/ELF/Config.h3
-rw-r--r--lld/ELF/Driver.cpp87
-rw-r--r--lld/ELF/InputFiles.cpp41
-rw-r--r--lld/ELF/InputFiles.h1
-rw-r--r--lld/ELF/InputSection.cpp1
-rw-r--r--lld/ELF/Relocations.cpp28
-rw-r--r--lld/ELF/Relocations.h1
-rw-r--r--lld/ELF/SyntheticSections.cpp40
-rw-r--r--lld/ELF/Writer.cpp3
-rw-r--r--lld/docs/ReleaseNotes.rst3
-rw-r--r--lld/docs/ld.lld.15
-rw-r--r--lld/test/COFF/print-search-paths-arm64.s24
-rw-r--r--lld/test/ELF/aarch64-bti-pac-cli-error.s15
-rw-r--r--lld/test/ELF/aarch64-feature-pauth.s114
-rw-r--r--lld/test/ELF/aarch64-reloc-pauth-ro.s22
-rw-r--r--lld/test/ELF/aarch64-reloc-pauth.s108
-rw-r--r--lld/test/ELF/linkerscript/symbolreferenced.s25
-rw-r--r--llvm/docs/CommandGuide/llvm-objcopy.rst8
-rw-r--r--llvm/docs/LangRef.rst2
-rw-r--r--llvm/docs/ReleaseNotes.rst4
-rw-r--r--llvm/include/llvm/ADT/APInt.h4
-rw-r--r--llvm/include/llvm/Analysis/TargetTransformInfo.h5
-rw-r--r--llvm/include/llvm/BinaryFormat/ELF.h26
-rw-r--r--llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def22
-rw-r--r--llvm/include/llvm/IR/AutoUpgrade.h3
-rw-r--r--llvm/include/llvm/IR/DebugProgramInstruction.h19
-rw-r--r--llvm/include/llvm/IR/Intrinsics.td3
-rw-r--r--llvm/include/llvm/IR/PassManager.h24
-rw-r--r--llvm/include/llvm/IR/PrintPasses.h19
-rw-r--r--llvm/include/llvm/ObjCopy/CommonConfig.h3
-rw-r--r--llvm/include/llvm/ProfileData/MemProf.h67
-rw-r--r--llvm/include/llvm/TextAPI/InterfaceFile.h4
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation/RemoveTrapsPass.h2
-rw-r--r--llvm/lib/Analysis/LazyValueInfo.cpp13
-rw-r--r--llvm/lib/Analysis/ScalarEvolution.cpp11
-rw-r--r--llvm/lib/Analysis/ValueTracking.cpp110
-rw-r--r--llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp17
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp22
-rw-r--r--llvm/lib/CodeGen/MIRPrinter.cpp20
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp17
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp16
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp29
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp44
-rw-r--r--llvm/lib/IR/AutoUpgrade.cpp39
-rw-r--r--llvm/lib/IR/LegacyPassManager.cpp7
-rw-r--r--llvm/lib/IR/Verifier.cpp27
-rw-r--r--llvm/lib/Linker/IRMover.cpp21
-rw-r--r--llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp34
-rw-r--r--llvm/lib/ProfileData/InstrProfReader.cpp2
-rw-r--r--llvm/lib/ProfileData/InstrProfWriter.cpp217
-rw-r--r--llvm/lib/ProfileData/MemProf.cpp170
-rw-r--r--llvm/lib/Support/APInt.cpp58
-rw-r--r--llvm/lib/Target/AArch64/AArch64.td53
-rw-r--r--llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp12
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp29
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp2
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp6
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp33
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp45
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp52
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp30
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td8
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td35
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.h1
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp4
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h16
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp8
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h6
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp2
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp19
-rw-r--r--llvm/lib/Target/Sparc/SparcAsmPrinter.cpp44
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp2
-rw-r--r--llvm/lib/Target/X86/X86InstrCompiler.td5
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp32
-rw-r--r--llvm/lib/Target/X86/X86InstrSSE.td116
-rw-r--r--llvm/lib/Target/X86/X86SchedBroadwell.td6
-rw-r--r--llvm/lib/Target/X86/X86SchedHaswell.td9
-rw-r--r--llvm/lib/Target/X86/X86SchedSapphireRapids.td14
-rw-r--r--llvm/lib/Target/X86/X86ScheduleZnver3.td86
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp4
-rw-r--r--llvm/lib/TextAPI/InterfaceFile.cpp8
-rw-r--r--llvm/lib/TextAPI/TextStubV5.cpp2
-rw-r--r--llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp9
-rw-r--r--llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp49
-rw-r--r--llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp47
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp205
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp12
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp16
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h68
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp16
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp56
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp147
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.h7
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanValue.h1
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp43
-rw-r--r--llvm/lib/Transforms/Vectorize/VectorCombine.cpp71
-rw-r--r--llvm/lib/WindowsDriver/MSVCPaths.cpp4
-rw-r--r--llvm/test/Analysis/CostModel/X86/cast.ll8
-rw-r--r--llvm/test/Analysis/CostModel/X86/extend.ll8
-rw-r--r--llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll2413
-rw-r--r--llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll16
-rw-r--r--llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll16
-rw-r--r--llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll2413
-rw-r--r--llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll2413
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir28
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir12
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll112
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll5
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll20
-rw-r--r--llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll11
-rw-r--r--llvm/test/CodeGen/AArch64/cheap-as-a-move.ll30
-rw-r--r--llvm/test/CodeGen/AArch64/extract-bits.ll5
-rw-r--r--llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll5
-rw-r--r--llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll50
-rw-r--r--llvm/test/CodeGen/AArch64/sink-and-fold.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll86
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16.ll73
-rw-r--r--llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/function-args.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/function-returns.ll87
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll582
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/skip-if-dead.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll21
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll183
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fold-binop-into-select.ll60
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll209
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll256
-rw-r--r--llvm/test/CodeGen/SPARC/inlineasm-bad.ll9
-rw-r--r--llvm/test/CodeGen/SPARC/inlineasm.ll9
-rw-r--r--llvm/test/CodeGen/SPIRV/OpVariable_order.ll14
-rw-r--r--llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll26
-rw-r--r--llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll60
-rw-r--r--llvm/test/CodeGen/WebAssembly/multi-return.ll72
-rw-r--r--llvm/test/CodeGen/WebAssembly/simd-arith.ll13138
-rw-r--r--llvm/test/CodeGen/WebAssembly/simd.ll408
-rw-r--r--llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll11
-rw-r--r--llvm/test/CodeGen/X86/evex-to-vex-compress.mir32
-rw-r--r--llvm/test/CodeGen/X86/freeze-vector.ll6
-rw-r--r--llvm/test/CodeGen/X86/load-local-v3i129.ll4
-rw-r--r--llvm/test/CodeGen/X86/pr23664.ll2
-rw-r--r--llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll18
-rw-r--r--llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out-no-ps.ll6
-rw-r--r--llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll36
-rw-r--r--llvm/test/TableGen/x86-fold-tables.inc28
-rw-r--r--llvm/test/Transforms/CorrelatedValuePropagation/range.ll86
-rw-r--r--llvm/test/Transforms/InstCombine/implies.ll424
-rw-r--r--llvm/test/Transforms/InstCombine/known-bits.ll106
-rw-r--r--llvm/test/Transforms/InstCombine/zext-or-icmp.ll10
-rw-r--r--llvm/test/Transforms/InstSimplify/implies.ll42
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-force-tail-with-evl.ll51
-rw-r--r--llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll117
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll68
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll116
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll175
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll124
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll132
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll36
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll119
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll142
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll134
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll191
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll89
-rw-r--r--llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll101
-rw-r--r--llvm/test/Transforms/LoopVectorize/vplan-force-tail-with-evl.ll37
-rw-r--r--llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll6
-rw-r--r--llvm/test/Transforms/RemoveTraps/remove-traps.ll233
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll60
-rw-r--r--llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll79
-rw-r--r--llvm/test/Verifier/module-flags-note-gnu-property-elf-pauthabi.ll19
-rw-r--r--llvm/test/Verifier/pr69428.ll48
-rw-r--r--llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s26
-rw-r--r--llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s18
-rw-r--r--llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s26
-rw-r--r--llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s18
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s72
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s24
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s24
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s24
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s24
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s48
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s48
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s36
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s48
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s72
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s72
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s48
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s72
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s12
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s216
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s240
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s48
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s216
-rw-r--r--llvm/test/tools/llvm-objcopy/ELF/compress-sections-within-segment.s38
-rw-r--r--llvm/test/tools/llvm-objcopy/ELF/compress-sections.s128
-rw-r--r--llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test29
-rw-r--r--llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s298
-rw-r--r--llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s2
-rw-r--r--llvm/tools/gold/CMakeLists.txt2
-rw-r--r--llvm/tools/llvm-dis/llvm-dis.cpp6
-rw-r--r--llvm/tools/llvm-objcopy/ObjcopyOptions.cpp36
-rw-r--r--llvm/tools/llvm-objcopy/ObjcopyOpts.td6
-rw-r--r--llvm/tools/llvm-readobj/ELFDumper.cpp127
-rw-r--r--llvm/unittests/ADT/APIntTest.cpp18
-rw-r--r--llvm/unittests/ProfileData/MemProfTest.cpp41
-rw-r--r--llvm/unittests/TextAPI/TextStubV5Tests.cpp4
-rw-r--r--llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp9
-rw-r--r--llvm/utils/TableGen/X86ManualCompressEVEXTables.def16
-rw-r--r--llvm/utils/gn/secondary/bolt/lib/Profile/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/libcxx/include/BUILD.gn3
-rw-r--r--llvm/utils/gn/secondary/libcxx/src/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn1
-rw-r--r--mlir/include/mlir/Dialect/Affine/LoopUtils.h49
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td6
-rw-r--r--mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td14
-rw-r--r--mlir/include/mlir/Dialect/SCF/Utils/Utils.h7
-rw-r--r--mlir/include/mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h9
-rw-r--r--mlir/include/mlir/IR/PatternMatch.h3
-rw-r--r--mlir/include/mlir/Interfaces/MemorySlotInterfaces.td30
-rw-r--r--mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h16
-rw-r--r--mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp4
-rw-r--r--mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp6
-rw-r--r--mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp48
-rw-r--r--mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp6
-rw-r--r--mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp4
-rw-r--r--mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp19
-rw-r--r--mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp36
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp2
-rw-r--r--mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp49
-rw-r--r--mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp2
-rw-r--r--mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp4
-rw-r--r--mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp4
-rw-r--r--mlir/lib/Dialect/SCF/Utils/Utils.cpp353
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp2
-rw-r--r--mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp21
-rw-r--r--mlir/lib/IR/PatternMatch.cpp9
-rw-r--r--mlir/lib/Interfaces/ValueBoundsOpInterface.cpp94
-rw-r--r--mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp102
-rw-r--r--mlir/lib/Transforms/Mem2Reg.cpp16
-rw-r--r--mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir12
-rw-r--r--mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir2
-rw-r--r--mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir6
-rw-r--r--mlir/test/Dialect/Affine/SuperVectorize/vectorize_unsupported.mlir2
-rw-r--r--mlir/test/Dialect/Affine/loop-coalescing.mlir71
-rw-r--r--mlir/test/Dialect/Affine/loop-fusion-dependence-check.mlir2
-rw-r--r--mlir/test/Dialect/Affine/loop-fusion-slice-computation.mlir2
-rw-r--r--mlir/test/Dialect/Affine/loop-fusion-transformation.mlir2
-rw-r--r--mlir/test/Dialect/Affine/slicing-utils.mlir6
-rw-r--r--mlir/test/Dialect/ControlFlow/ops.mlir13
-rw-r--r--mlir/test/Dialect/LLVMIR/mem2reg-dbginfo.mlir21
-rw-r--r--mlir/test/Dialect/OpenMP/invalid.mlir19
-rw-r--r--mlir/test/Dialect/OpenMP/ops.mlir8
-rw-r--r--mlir/test/Dialect/SCF/transform-op-coalesce.mlir211
-rw-r--r--mlir/test/Target/LLVMIR/openmp-parallel-reduction-cleanup.mlir94
-rw-r--r--mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir111
-rw-r--r--mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir86
-rw-r--r--mlir/test/Transforms/parallel-loop-collapsing.mlir32
-rw-r--r--mlir/test/Transforms/single-parallel-loop-collapsing.mlir32
-rw-r--r--mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp35
-rw-r--r--mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp9
-rw-r--r--mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp64
-rw-r--r--mlir/test/mlir-tblgen/op-properties.td21
-rw-r--r--mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp2
-rw-r--r--mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp52
-rw-r--r--openmp/runtime/test/affinity/kmp-abs-hw-subset.c6
-rw-r--r--utils/bazel/.bazelrc11
-rw-r--r--utils/bazel/configure.bzl10
-rw-r--r--utils/bazel/llvm-project-overlay/libc/BUILD.bazel82
-rw-r--r--utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel1
-rw-r--r--utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel147
-rw-r--r--utils/bazel/llvm-project-overlay/lldb/BUILD.bazel848
-rw-r--r--utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel2319
-rw-r--r--utils/bazel/llvm-project-overlay/lldb/source/Plugins/plugin_config.bzl104
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/BUILD.bazel1
529 files changed, 29028 insertions, 12957 deletions
diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index 4a881ef5ff56..1e9367732e59 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -38,11 +38,11 @@ env:
# LLVM POST-BRANCH bump version
# LLVM POST-BRANCH add compiler test for ToT - 1, e.g. "Clang 17"
# LLVM RELEASE bump remove compiler ToT - 3, e.g. "Clang 15"
- LLVM_HEAD_VERSION: "18" # Used compiler, update POST-BRANCH.
- LLVM_PREVIOUS_VERSION: "17"
- LLVM_OLDEST_VERSION: "16"
+ LLVM_HEAD_VERSION: "19" # Used compiler, update POST-BRANCH.
+ LLVM_PREVIOUS_VERSION: "18"
+ LLVM_OLDEST_VERSION: "17"
GCC_STABLE_VERSION: "13"
- LLVM_SYMBOLIZER_PATH: "/usr/bin/llvm-symbolizer-18"
+ LLVM_SYMBOLIZER_PATH: "/usr/bin/llvm-symbolizer-19"
CLANG_CRASH_DIAGNOSTICS_DIR: "crash_diagnostics"
@@ -59,8 +59,8 @@ jobs:
'generic-cxx26',
'generic-modules'
]
- cc: [ 'clang-18' ]
- cxx: [ 'clang++-18' ]
+ cc: [ 'clang-19' ]
+ cxx: [ 'clang++-19' ]
clang_tidy: [ 'ON' ]
include:
- config: 'generic-gcc'
@@ -100,8 +100,8 @@ jobs:
'generic-cxx20',
'generic-cxx23'
]
- cc: [ 'clang-18' ]
- cxx: [ 'clang++-18' ]
+ cc: [ 'clang-19' ]
+ cxx: [ 'clang++-19' ]
clang_tidy: [ 'ON' ]
include:
- config: 'generic-gcc-cxx11'
@@ -109,13 +109,13 @@ jobs:
cxx: 'g++-13'
clang_tidy: 'OFF'
- config: 'generic-cxx23'
- cc: 'clang-16'
- cxx: 'clang++-16'
- clang_tidy: 'OFF'
- - config: 'generic-cxx23'
cc: 'clang-17'
cxx: 'clang++-17'
clang_tidy: 'OFF'
+ - config: 'generic-cxx26'
+ cc: 'clang-18'
+ cxx: 'clang++-18'
+ clang_tidy: 'ON'
steps:
- uses: actions/checkout@v4
- name: ${{ matrix.config }}
@@ -186,8 +186,8 @@ jobs:
- name: ${{ matrix.config }}
run: libcxx/utils/ci/run-buildbot ${{ matrix.config }}
env:
- CC: clang-18
- CXX: clang++-18
+ CC: clang-19
+ CXX: clang++-19
ENABLE_CLANG_TIDY: "OFF"
- uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
if: always()
diff --git a/clang-tools-extra/clangd/IncludeCleaner.h b/clang-tools-extra/clangd/IncludeCleaner.h
index 387763de3407..624e2116be7d 100644
--- a/clang-tools-extra/clangd/IncludeCleaner.h
+++ b/clang-tools-extra/clangd/IncludeCleaner.h
@@ -62,15 +62,6 @@ issueIncludeCleanerDiagnostics(ParsedAST &AST, llvm::StringRef Code,
const ThreadsafeFS &TFS,
HeaderFilter IgnoreHeader = {});
-/// Affects whether standard library includes should be considered for
-/// removal. This is off by default for now due to implementation limitations:
-/// - macros are not tracked
-/// - symbol names without a unique associated header are not tracked
-/// - references to std-namespaced C types are not properly tracked:
-/// instead of std::size_t -> <cstddef> we see ::size_t -> <stddef.h>
-/// FIXME: remove this hack once the implementation is good enough.
-void setIncludeCleanerAnalyzesStdlib(bool B);
-
/// Converts the clangd include representation to include-cleaner
/// include representation.
include_cleaner::Includes convertIncludes(const ParsedAST &);
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
index c30bccd06674..9a65f76cdf56 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
@@ -43,6 +43,15 @@ enum class ComparisonResult {
Unknown,
};
+/// The result of a `widen` operation.
+struct WidenResult {
+ /// Non-null pointer to a potentially widened version of the input value.
+ Value *V;
+ /// Whether `V` represents a "change" (that is, a different value) with
+ /// respect to the previous value in the sequence.
+ LatticeEffect Effect;
+};
+
/// Holds the state of the program (store and heap) at a given program point.
///
/// WARNING: Symbolic values that are created by the environment for static
@@ -104,14 +113,17 @@ public:
/// serve as a comparison operation, by indicating whether the widened value
/// is equivalent to the previous value.
///
- /// Returns either:
- ///
- /// `nullptr`, if this value is not of interest to the model, or
- ///
- /// `&Prev`, if the widened value is equivalent to `Prev`, or
- ///
- /// A non-null value that approximates `Current`. `Prev` is available to
- /// inform the chosen approximation.
+ /// Returns one of the folowing:
+ /// * `std::nullopt`, if this value is not of interest to the
+ /// model.
+ /// * A `WidenResult` with:
+ /// * A non-null `Value *` that points either to `Current` or a widened
+ /// version of `Current`. This value must be consistent with
+ /// the flow condition of `CurrentEnv`. We particularly caution
+ /// against using `Prev`, which is rarely consistent.
+ /// * A `LatticeEffect` indicating whether the value should be
+ /// considered a new value (`Changed`) or one *equivalent* (if not
+ /// necessarily equal) to `Prev` (`Unchanged`).
///
/// `PrevEnv` and `CurrentEnv` can be used to query child values and path
/// condition implications of `Prev` and `Current`, respectively.
@@ -122,17 +134,19 @@ public:
///
/// `Prev` and `Current` must be assigned to the same storage location in
/// `PrevEnv` and `CurrentEnv`, respectively.
- virtual Value *widen(QualType Type, Value &Prev, const Environment &PrevEnv,
- Value &Current, Environment &CurrentEnv) {
+ virtual std::optional<WidenResult> widen(QualType Type, Value &Prev,
+ const Environment &PrevEnv,
+ Value &Current,
+ Environment &CurrentEnv) {
// The default implementation reduces to just comparison, since comparison
// is required by the API, even if no widening is performed.
switch (compare(Type, Prev, PrevEnv, Current, CurrentEnv)) {
- case ComparisonResult::Same:
- return &Prev;
- case ComparisonResult::Different:
- return &Current;
- case ComparisonResult::Unknown:
- return nullptr;
+ case ComparisonResult::Unknown:
+ return std::nullopt;
+ case ComparisonResult::Same:
+ return WidenResult{&Current, LatticeEffect::Unchanged};
+ case ComparisonResult::Different:
+ return WidenResult{&Current, LatticeEffect::Changed};
}
llvm_unreachable("all cases in switch covered");
}
@@ -236,8 +250,8 @@ public:
///
/// `PrevEnv` must be the immediate previous version of the environment.
/// `PrevEnv` and `this` must use the same `DataflowAnalysisContext`.
- LatticeJoinEffect widen(const Environment &PrevEnv,
- Environment::ValueModel &Model);
+ LatticeEffect widen(const Environment &PrevEnv,
+ Environment::ValueModel &Model);
// FIXME: Rename `createOrGetStorageLocation` to `getOrCreateStorageLocation`,
// `getStableStorageLocation`, or something more appropriate.
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowLattice.h b/clang/include/clang/Analysis/FlowSensitive/DataflowLattice.h
index 0c81e2f078c2..b262732804ed 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowLattice.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowLattice.h
@@ -17,13 +17,13 @@
namespace clang {
namespace dataflow {
-/// Effect indicating whether a lattice join operation resulted in a new value.
-// FIXME: Rename to `LatticeEffect` since `widen` uses it as well, and we are
-// likely removing it from `join`.
-enum class LatticeJoinEffect {
+/// Effect indicating whether a lattice operation resulted in a new value.
+enum class LatticeEffect {
Unchanged,
Changed,
};
+// DEPRECATED. Use `LatticeEffect`.
+using LatticeJoinEffect = LatticeEffect;
} // namespace dataflow
} // namespace clang
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index c3e90a70925b..12e8dc7912c3 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -802,9 +802,11 @@ def B : JoinedOrSeparate<["-"], "B">, MetaVarName<"<prefix>">,
HelpText<"Search $prefix$file for executables, libraries, and data files. "
"If $prefix is a directory, search $prefix/$file">;
def gcc_install_dir_EQ : Joined<["--"], "gcc-install-dir=">,
+ Visibility<[ClangOption, FlangOption]>,
HelpText<"Use GCC installation in the specified directory. The directory ends with path components like 'lib{,32,64}/gcc{,-cross}/$triple/$version'. "
"Note: executables (e.g. ld) used by the compiler are not overridden by the selected GCC installation">;
def gcc_toolchain : Joined<["--"], "gcc-toolchain=">, Flags<[NoXarchOption]>,
+ Visibility<[ClangOption, FlangOption]>,
HelpText<"Specify a directory where Clang can find 'include' and 'lib{,32,64}/gcc{,-cross}/$triple/$version'. "
"Clang will use the GCC installation with the largest version">;
def gcc_triple_EQ : Joined<["--"], "gcc-triple=">,
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index f729d676dd0d..70ac0764476f 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -166,17 +166,16 @@ static Value *joinDistinctValues(QualType Type, Value &Val1,
return JoinedVal;
}
-// When widening does not change `Current`, return value will equal `&Prev`.
-static Value &widenDistinctValues(QualType Type, Value &Prev,
- const Environment &PrevEnv, Value &Current,
- Environment &CurrentEnv,
- Environment::ValueModel &Model) {
+static WidenResult widenDistinctValues(QualType Type, Value &Prev,
+ const Environment &PrevEnv,
+ Value &Current, Environment &CurrentEnv,
+ Environment::ValueModel &Model) {
// Boolean-model widening.
if (auto *PrevBool = dyn_cast<BoolValue>(&Prev)) {
- // If previous value was already Top, re-use that to (implicitly) indicate
- // that no change occurred.
if (isa<TopBoolValue>(Prev))
- return Prev;
+ // Safe to return `Prev` here, because Top is never dependent on the
+ // environment.
+ return {&Prev, LatticeEffect::Unchanged};
// We may need to widen to Top, but before we do so, check whether both
// values are implied to be either true or false in the current environment.
@@ -185,22 +184,24 @@ static Value &widenDistinctValues(QualType Type, Value &Prev,
bool TruePrev = PrevEnv.proves(PrevBool->formula());
bool TrueCur = CurrentEnv.proves(CurBool.formula());
if (TruePrev && TrueCur)
- return CurrentEnv.getBoolLiteralValue(true);
+ return {&CurrentEnv.getBoolLiteralValue(true), LatticeEffect::Unchanged};
if (!TruePrev && !TrueCur &&
PrevEnv.proves(PrevEnv.arena().makeNot(PrevBool->formula())) &&
CurrentEnv.proves(CurrentEnv.arena().makeNot(CurBool.formula())))
- return CurrentEnv.getBoolLiteralValue(false);
+ return {&CurrentEnv.getBoolLiteralValue(false), LatticeEffect::Unchanged};
- return CurrentEnv.makeTopBoolValue();
+ return {&CurrentEnv.makeTopBoolValue(), LatticeEffect::Changed};
}
// FIXME: Add other built-in model widening.
// Custom-model widening.
- if (auto *W = Model.widen(Type, Prev, PrevEnv, Current, CurrentEnv))
- return *W;
+ if (auto Result = Model.widen(Type, Prev, PrevEnv, Current, CurrentEnv))
+ return *Result;
- return equateUnknownValues(Prev.getKind()) ? Prev : Current;
+ return {&Current, equateUnknownValues(Prev.getKind())
+ ? LatticeEffect::Unchanged
+ : LatticeEffect::Changed};
}
// Returns whether the values in `Map1` and `Map2` compare equal for those
@@ -271,7 +272,7 @@ llvm::MapVector<Key, Value *>
widenKeyToValueMap(const llvm::MapVector<Key, Value *> &CurMap,
const llvm::MapVector<Key, Value *> &PrevMap,
Environment &CurEnv, const Environment &PrevEnv,
- Environment::ValueModel &Model, LatticeJoinEffect &Effect) {
+ Environment::ValueModel &Model, LatticeEffect &Effect) {
llvm::MapVector<Key, Value *> WidenedMap;
for (auto &Entry : CurMap) {
Key K = Entry.first;
@@ -290,11 +291,11 @@ widenKeyToValueMap(const llvm::MapVector<Key, Value *> &CurMap,
continue;
}
- Value &WidenedVal = widenDistinctValues(K->getType(), *PrevIt->second,
- PrevEnv, *Val, CurEnv, Model);
- WidenedMap.insert({K, &WidenedVal});
- if (&WidenedVal != PrevIt->second)
- Effect = LatticeJoinEffect::Changed;
+ auto [WidenedVal, ValEffect] = widenDistinctValues(
+ K->getType(), *PrevIt->second, PrevEnv, *Val, CurEnv, Model);
+ WidenedMap.insert({K, WidenedVal});
+ if (ValEffect == LatticeEffect::Changed)
+ Effect = LatticeEffect::Changed;
}
return WidenedMap;
@@ -617,15 +618,15 @@ bool Environment::equivalentTo(const Environment &Other,
return true;
}
-LatticeJoinEffect Environment::widen(const Environment &PrevEnv,
- Environment::ValueModel &Model) {
+LatticeEffect Environment::widen(const Environment &PrevEnv,
+ Environment::ValueModel &Model) {
assert(DACtx == PrevEnv.DACtx);
assert(ReturnVal == PrevEnv.ReturnVal);
assert(ReturnLoc == PrevEnv.ReturnLoc);
assert(ThisPointeeLoc == PrevEnv.ThisPointeeLoc);
assert(CallStack == PrevEnv.CallStack);
- auto Effect = LatticeJoinEffect::Unchanged;
+ auto Effect = LatticeEffect::Unchanged;
// By the API, `PrevEnv` is a previous version of the environment for the same
// block, so we have some guarantees about its shape. In particular, it will
@@ -646,7 +647,7 @@ LatticeJoinEffect Environment::widen(const Environment &PrevEnv,
ExprToLoc.size() != PrevEnv.ExprToLoc.size() ||
ExprToVal.size() != PrevEnv.ExprToVal.size() ||
LocToVal.size() != PrevEnv.LocToVal.size())
- Effect = LatticeJoinEffect::Changed;
+ Effect = LatticeEffect::Changed;
return Effect;
}
diff --git a/clang/lib/Basic/Targets/Mips.h b/clang/lib/Basic/Targets/Mips.h
index c9dcf434c93b..0d6e4b4d0808 100644
--- a/clang/lib/Basic/Targets/Mips.h
+++ b/clang/lib/Basic/Targets/Mips.h
@@ -318,6 +318,7 @@ public:
FPMode = isFP64Default() ? FP64 : FPXX;
NoOddSpreg = false;
bool OddSpregGiven = false;
+ bool StrictAlign = false;
for (const auto &Feature : Features) {
if (Feature == "+single-float")
@@ -330,6 +331,10 @@ public:
IsMicromips = true;
else if (Feature == "+mips32r6" || Feature == "+mips64r6")
HasUnalignedAccess = true;
+ // We cannot be sure that the order of strict-align vs mips32r6.
+ // Thus we need an extra variable here.
+ else if (Feature == "+strict-align")
+ StrictAlign = true;
else if (Feature == "+dsp")
DspRev = std::max(DspRev, DSP1);
else if (Feature == "+dspr2")
@@ -368,6 +373,9 @@ public:
if (FPMode == FPXX && !OddSpregGiven)
NoOddSpreg = true;
+ if (StrictAlign)
+ HasUnalignedAccess = false;
+
setDataLayout();
return true;
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index c8b2a93ae47a..e25a17658a34 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -100,9 +100,6 @@ using namespace llvm;
namespace llvm {
extern cl::opt<bool> PrintPipelinePasses;
-static cl::opt<bool> ClRemoveTraps("clang-remove-traps", cl::Optional,
- cl::desc("Insert remove-traps pass."));
-
// Experiment to move sanitizers earlier.
static cl::opt<bool> ClSanitizeOnOptimizerEarlyEP(
"sanitizer-early-opt-ep", cl::Optional,
@@ -750,7 +747,7 @@ static void addSanitizers(const Triple &TargetTriple,
PB.registerOptimizerLastEPCallback(SanitizersCallback);
}
- if (ClRemoveTraps) {
+ if (RemoveTrapsPass::IsRequested()) {
// We can optimize after inliner, and PGO profile matching. The hook below
// is called at the end `buildFunctionSimplificationPipeline`, which called
// from `buildInlinerPipeline`, which called after profile matching.
diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp
index a01f2c7c9798..47f063b5501c 100644
--- a/clang/lib/CodeGen/CGBlocks.cpp
+++ b/clang/lib/CodeGen/CGBlocks.cpp
@@ -962,7 +962,7 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
}
// If it's a reference variable, copy the reference into the block field.
- } else if (auto refType = type->getAs<ReferenceType>()) {
+ } else if (type->getAs<ReferenceType>()) {
Builder.CreateStore(src.emitRawPointer(*this), blockField);
// If type is const-qualified, copy the value into the block field.
diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp
index a793b214645c..1facadd82f17 100644
--- a/clang/lib/CodeGen/CGExprComplex.cpp
+++ b/clang/lib/CodeGen/CGExprComplex.cpp
@@ -319,12 +319,12 @@ public:
// doubles the exponent of SmallerType.LargestFiniteVal)
if (llvm::APFloat::semanticsMaxExponent(ElementTypeSemantics) * 2 + 1 <=
llvm::APFloat::semanticsMaxExponent(HigherElementTypeSemantics)) {
+ FPHasBeenPromoted = true;
return CGF.getContext().getComplexType(HigherElementType);
} else {
- FPHasBeenPromoted = true;
DiagnosticsEngine &Diags = CGF.CGM.getDiags();
Diags.Report(diag::warn_next_larger_fp_type_same_size_than_fp);
- return CGF.getContext().getComplexType(ElementType);
+ return QualType();
}
}
@@ -1037,7 +1037,7 @@ ComplexPairTy ComplexExprEmitter::EmitBinDiv(const BinOpInfo &Op) {
LHSi = llvm::Constant::getNullValue(RHSi->getType());
if (Op.FPFeatures.getComplexRange() == LangOptions::CX_Improved ||
(Op.FPFeatures.getComplexRange() == LangOptions::CX_Promoted &&
- FPHasBeenPromoted))
+ !FPHasBeenPromoted))
return EmitRangeReductionDiv(LHSr, LHSi, RHSr, RHSi);
else if (Op.FPFeatures.getComplexRange() == LangOptions::CX_Basic ||
Op.FPFeatures.getComplexRange() == LangOptions::CX_Promoted)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index bc363313dec6..8eb10584699f 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -2648,9 +2648,9 @@ void CGOpenMPRuntime::emitDistributeStaticInit(
void CGOpenMPRuntime::emitForStaticFinish(CodeGenFunction &CGF,
SourceLocation Loc,
OpenMPDirectiveKind DKind) {
- assert(DKind == OMPD_distribute || DKind == OMPD_for ||
- DKind == OMPD_sections &&
- "Expected distribute, for, or sections directive kind");
+ assert((DKind == OMPD_distribute || DKind == OMPD_for ||
+ DKind == OMPD_sections) &&
+ "Expected distribute, for, or sections directive kind");
if (!CGF.HaveInsertPoint())
return;
// Call __kmpc_for_static_fini(ident_t *loc, kmp_int32 tid);
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index b7ec7e0a6097..766a9b91e3c0 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5882,7 +5882,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CM = "large";
if (Triple.isAArch64(64)) {
Ok = CM == "tiny" || CM == "small" || CM == "large";
- if (CM == "large" && RelocationModel != llvm::Reloc::Static)
+ if (CM == "large" && !Triple.isOSBinFormatMachO() &&
+ RelocationModel != llvm::Reloc::Static)
D.Diag(diag::err_drv_argument_only_allowed_with)
<< A->getAsString(Args) << "-fno-pic";
} else if (Triple.isLoongArch()) {
diff --git a/clang/lib/Headers/__stddef_unreachable.h b/clang/lib/Headers/__stddef_unreachable.h
index 518580c92d3f..61df43e9732f 100644
--- a/clang/lib/Headers/__stddef_unreachable.h
+++ b/clang/lib/Headers/__stddef_unreachable.h
@@ -7,6 +7,8 @@
*===-----------------------------------------------------------------------===
*/
+#ifndef __cplusplus
+
/*
* When -fbuiltin-headers-in-system-modules is set this is a non-modular header
* and needs to behave as if it was textual.
@@ -15,3 +17,5 @@
(__has_feature(modules) && !__building_module(_Builtin_stddef))
#define unreachable() __builtin_unreachable()
#endif
+
+#endif
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index a2b8cc14ca76..d3def13f495d 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -2711,7 +2711,6 @@ SmallVector<unsigned> TemplateParamsReferencedInTemplateArgumentList(
: TemplateParams(TemplateParams.begin(), TemplateParams.end()) {}
bool VisitTemplateTypeParmType(TemplateTypeParmType *TTP) {
- TTP->getIndex();
MarkAppeared(TTP->getDecl());
return true;
}
diff --git a/clang/test/CodeGen/cx-complex-range.c b/clang/test/CodeGen/cx-complex-range.c
index 9ec80252085b..38f99230eea5 100644
--- a/clang/test/CodeGen/cx-complex-range.c
+++ b/clang/test/CodeGen/cx-complex-range.c
@@ -48,6 +48,15 @@
// RUN: -ffast-math -complex-range=promoted -emit-llvm -o - %s \
// RUN: | FileCheck %s --check-prefix=PRMTD_FAST
+// strict math mode
+// RUN: %clang_cc1 -triple x86_64-windows-pc -complex-range=promoted \
+// RUN: -ffp-contract=off -frounding-math -ffp-exception-behavior=strict \
+// RUN: -emit-llvm -o - %s | FileCheck %s --check-prefix=X86WINPRMTD_STRICT
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -complex-range=promoted \
+// RUN: -ffp-contract=off -frounding-math -ffp-exception-behavior=strict \
+// RUN: -emit-llvm -o - %s | FileCheck %s --check-prefix=PRMTD_STRICT
+
// FULL-LABEL: define dso_local <2 x float> @divf(
// FULL-SAME: <2 x float> noundef [[A_COERCE:%.*]], <2 x float> noundef [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
// FULL-NEXT: entry:
@@ -504,6 +513,86 @@
// PRMTD_FAST-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr [[RETVAL]], align 4
// PRMTD_FAST-NEXT: ret <2 x float> [[TMP11]]
//
+// X86WINPRMTD_STRICT-LABEL: define dso_local i64 @divf(
+// X86WINPRMTD_STRICT-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// X86WINPRMTD_STRICT-NEXT: entry:
+// X86WINPRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { float, float }, align 4
+// X86WINPRMTD_STRICT-NEXT: [[A:%.*]] = alloca { float, float }, align 4
+// X86WINPRMTD_STRICT-NEXT: [[B:%.*]] = alloca { float, float }, align 4
+// X86WINPRMTD_STRICT-NEXT: store i64 [[A_COERCE]], ptr [[A]], align 4
+// X86WINPRMTD_STRICT-NEXT: store i64 [[B_COERCE]], ptr [[B]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[EXT:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[A_REAL]], metadata !"fpexcept.strict") #[[ATTR3:[0-9]+]]
+// X86WINPRMTD_STRICT-NEXT: [[EXT1:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[A_IMAG]], metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load float, ptr [[B_REALP]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load float, ptr [[B_IMAGP]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[EXT2:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[B_REAL]], metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[EXT3:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[B_IMAG]], metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP0:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT]], double [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP1:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT1]], double [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP0]], double [[TMP1]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP3:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT2]], double [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP4:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT3]], double [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP5:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP3]], double [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT1]], double [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP7:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT]], double [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP8:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP6]], double [[TMP7]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP9:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP2]], double [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP10:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP8]], double [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[UNPROMOTION:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP9]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[UNPROMOTION4:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: store float [[UNPROMOTION]], ptr [[RETVAL_REALP]], align 4
+// X86WINPRMTD_STRICT-NEXT: store float [[UNPROMOTION4]], ptr [[RETVAL_IMAGP]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[TMP11:%.*]] = load i64, ptr [[RETVAL]], align 4
+// X86WINPRMTD_STRICT-NEXT: ret i64 [[TMP11]]
+//
+// PRMTD_STRICT-LABEL: define dso_local <2 x float> @divf(
+// PRMTD_STRICT-SAME: <2 x float> noundef [[A_COERCE:%.*]], <2 x float> noundef [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// PRMTD_STRICT-NEXT: entry:
+// PRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { float, float }, align 4
+// PRMTD_STRICT-NEXT: [[A:%.*]] = alloca { float, float }, align 4
+// PRMTD_STRICT-NEXT: [[B:%.*]] = alloca { float, float }, align 4
+// PRMTD_STRICT-NEXT: store <2 x float> [[A_COERCE]], ptr [[A]], align 4
+// PRMTD_STRICT-NEXT: store <2 x float> [[B_COERCE]], ptr [[B]], align 4
+// PRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 4
+// PRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 4
+// PRMTD_STRICT-NEXT: [[EXT:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[A_REAL]], metadata !"fpexcept.strict") #[[ATTR4:[0-9]+]]
+// PRMTD_STRICT-NEXT: [[EXT1:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[A_IMAG]], metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load float, ptr [[B_REALP]], align 4
+// PRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load float, ptr [[B_IMAGP]], align 4
+// PRMTD_STRICT-NEXT: [[EXT2:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[B_REAL]], metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[EXT3:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[B_IMAG]], metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP0:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT]], double [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP1:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT1]], double [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP0]], double [[TMP1]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP3:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT2]], double [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP4:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT3]], double [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP5:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP3]], double [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT1]], double [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP7:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT]], double [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP8:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP6]], double [[TMP7]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP9:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP2]], double [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP10:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP8]], double [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[UNPROMOTION:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP9]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[UNPROMOTION4:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: store float [[UNPROMOTION]], ptr [[RETVAL_REALP]], align 4
+// PRMTD_STRICT-NEXT: store float [[UNPROMOTION4]], ptr [[RETVAL_IMAGP]], align 4
+// PRMTD_STRICT-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr [[RETVAL]], align 4
+// PRMTD_STRICT-NEXT: ret <2 x float> [[TMP11]]
+//
_Complex float divf(_Complex float a, _Complex float b) {
return a / b;
}
@@ -873,6 +962,64 @@ _Complex float divf(_Complex float a, _Complex float b) {
// PRMTD_FAST-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[RETVAL]], align 4
// PRMTD_FAST-NEXT: ret <2 x float> [[TMP0]]
//
+// X86WINPRMTD_STRICT-LABEL: define dso_local i64 @mulf(
+// X86WINPRMTD_STRICT-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// X86WINPRMTD_STRICT-NEXT: entry:
+// X86WINPRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { float, float }, align 4
+// X86WINPRMTD_STRICT-NEXT: [[A:%.*]] = alloca { float, float }, align 4
+// X86WINPRMTD_STRICT-NEXT: [[B:%.*]] = alloca { float, float }, align 4
+// X86WINPRMTD_STRICT-NEXT: store i64 [[A_COERCE]], ptr [[A]], align 4
+// X86WINPRMTD_STRICT-NEXT: store i64 [[B_COERCE]], ptr [[B]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load float, ptr [[B_REALP]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load float, ptr [[B_IMAGP]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[MUL_AC:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[A_REAL]], float [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[MUL_BD:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[A_IMAG]], float [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[MUL_AD:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[A_REAL]], float [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[MUL_BC:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[A_IMAG]], float [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[MUL_R:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[MUL_AC]], float [[MUL_BD]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[MUL_I:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[MUL_AD]], float [[MUL_BC]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: store float [[MUL_R]], ptr [[RETVAL_REALP]], align 4
+// X86WINPRMTD_STRICT-NEXT: store float [[MUL_I]], ptr [[RETVAL_IMAGP]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[TMP0:%.*]] = load i64, ptr [[RETVAL]], align 4
+// X86WINPRMTD_STRICT-NEXT: ret i64 [[TMP0]]
+//
+// PRMTD_STRICT-LABEL: define dso_local <2 x float> @mulf(
+// PRMTD_STRICT-SAME: <2 x float> noundef [[A_COERCE:%.*]], <2 x float> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// PRMTD_STRICT-NEXT: entry:
+// PRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { float, float }, align 4
+// PRMTD_STRICT-NEXT: [[A:%.*]] = alloca { float, float }, align 4
+// PRMTD_STRICT-NEXT: [[B:%.*]] = alloca { float, float }, align 4
+// PRMTD_STRICT-NEXT: store <2 x float> [[A_COERCE]], ptr [[A]], align 4
+// PRMTD_STRICT-NEXT: store <2 x float> [[B_COERCE]], ptr [[B]], align 4
+// PRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 4
+// PRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 4
+// PRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load float, ptr [[B_REALP]], align 4
+// PRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load float, ptr [[B_IMAGP]], align 4
+// PRMTD_STRICT-NEXT: [[MUL_AC:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[A_REAL]], float [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[MUL_BD:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[A_IMAG]], float [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[MUL_AD:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[A_REAL]], float [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[MUL_BC:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[A_IMAG]], float [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[MUL_R:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[MUL_AC]], float [[MUL_BD]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[MUL_I:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[MUL_AD]], float [[MUL_BC]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: store float [[MUL_R]], ptr [[RETVAL_REALP]], align 4
+// PRMTD_STRICT-NEXT: store float [[MUL_I]], ptr [[RETVAL_IMAGP]], align 4
+// PRMTD_STRICT-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[RETVAL]], align 4
+// PRMTD_STRICT-NEXT: ret <2 x float> [[TMP0]]
+//
// Complex float multiplication under test. The CHECK blocks above verify the
// four partial products (MUL_AC, MUL_BD, MUL_AD, MUL_BC) and the final
// real = ac-bd / imag = ad+bc combination; under the *_STRICT prefixes every
// operation is a constrained intrinsic (round.dynamic, fpexcept.strict).
// Body must stay byte-identical so the autogenerated checks keep matching.
_Complex float mulf(_Complex float a, _Complex float b) {
  return a * b;
}
@@ -1411,6 +1558,112 @@ _Complex float mulf(_Complex float a, _Complex float b) {
// PRMTD_FAST-NEXT: [[TMP15:%.*]] = load { double, double }, ptr [[RETVAL]], align 8
// PRMTD_FAST-NEXT: ret { double, double } [[TMP15]]
//
+// X86WINPRMTD_STRICT-LABEL: define dso_local void @divd(
+// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] {
+// X86WINPRMTD_STRICT-NEXT: entry:
+// X86WINPRMTD_STRICT-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 8
+// X86WINPRMTD_STRICT-NEXT: [[B_INDIRECT_ADDR:%.*]] = alloca ptr, align 8
+// X86WINPRMTD_STRICT-NEXT: [[A_INDIRECT_ADDR:%.*]] = alloca ptr, align 8
+// X86WINPRMTD_STRICT-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 8
+// X86WINPRMTD_STRICT-NEXT: store ptr [[B]], ptr [[B_INDIRECT_ADDR]], align 8
+// X86WINPRMTD_STRICT-NEXT: store ptr [[A]], ptr [[A_INDIRECT_ADDR]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load double, ptr [[A_IMAGP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[TMP0:%.*]] = call double @llvm.fabs.f64(double [[B_REAL]]) #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP1:%.*]] = call double @llvm.fabs.f64(double [[B_IMAG]]) #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[ABS_CMP:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[TMP0]], double [[TMP1]], metadata !"ugt", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: br i1 [[ABS_CMP]], label [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI:%.*]], label [[ABS_RHSR_LESS_THAN_ABS_RHSI:%.*]]
+// X86WINPRMTD_STRICT: abs_rhsr_greater_or_equal_abs_rhsi:
+// X86WINPRMTD_STRICT-NEXT: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[B_IMAG]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP3:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[TMP2]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP4:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[B_REAL]], double [[TMP3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP5:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[A_REAL]], double [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP7:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP6]], double [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP8:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP9:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[A_IMAG]], double [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP10:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP9]], double [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: br label [[COMPLEX_DIV:%.*]]
+// X86WINPRMTD_STRICT: abs_rhsr_less_than_abs_rhsi:
+// X86WINPRMTD_STRICT-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[B_REAL]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[TMP11]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[B_IMAG]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP14:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP14]], double [[A_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP16:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP15]], double [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP18:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP17]], double [[A_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP18]], double [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: br label [[COMPLEX_DIV]]
+// X86WINPRMTD_STRICT: complex_div:
+// X86WINPRMTD_STRICT-NEXT: [[TMP20:%.*]] = phi double [ [[TMP7]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP16]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ]
+// X86WINPRMTD_STRICT-NEXT: [[TMP21:%.*]] = phi double [ [[TMP10]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP19]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ]
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: store double [[TMP20]], ptr [[AGG_RESULT_REALP]], align 8
+// X86WINPRMTD_STRICT-NEXT: store double [[TMP21]], ptr [[AGG_RESULT_IMAGP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP1:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REAL:%.*]] = load double, ptr [[AGG_RESULT_REALP1]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP2:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAG:%.*]] = load double, ptr [[AGG_RESULT_IMAGP2]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP3:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP4:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: store double [[AGG_RESULT_REAL]], ptr [[AGG_RESULT_REALP3]], align 8
+// X86WINPRMTD_STRICT-NEXT: store double [[AGG_RESULT_IMAG]], ptr [[AGG_RESULT_IMAGP4]], align 8
+// X86WINPRMTD_STRICT-NEXT: ret void
+//
+// PRMTD_STRICT-LABEL: define dso_local { double, double } @divd(
+// PRMTD_STRICT-SAME: double noundef [[A_COERCE0:%.*]], double noundef [[A_COERCE1:%.*]], double noundef [[B_COERCE0:%.*]], double noundef [[B_COERCE1:%.*]]) #[[ATTR2:[0-9]+]] {
+// PRMTD_STRICT-NEXT: entry:
+// PRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { double, double }, align 8
+// PRMTD_STRICT-NEXT: [[A:%.*]] = alloca { double, double }, align 8
+// PRMTD_STRICT-NEXT: [[B:%.*]] = alloca { double, double }, align 8
+// PRMTD_STRICT-NEXT: [[TMP0:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: store double [[A_COERCE0]], ptr [[TMP0]], align 8
+// PRMTD_STRICT-NEXT: [[TMP1:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: store double [[A_COERCE1]], ptr [[TMP1]], align 8
+// PRMTD_STRICT-NEXT: [[TMP2:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: store double [[B_COERCE0]], ptr [[TMP2]], align 8
+// PRMTD_STRICT-NEXT: [[TMP3:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: store double [[B_COERCE1]], ptr [[TMP3]], align 8
+// PRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8
+// PRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load double, ptr [[A_IMAGP]], align 8
+// PRMTD_STRICT-NEXT: [[EXT:%.*]] = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f64(double [[A_REAL]], metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[EXT1:%.*]] = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f64(double [[A_IMAG]], metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8
+// PRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 8
+// PRMTD_STRICT-NEXT: [[EXT2:%.*]] = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f64(double [[B_REAL]], metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[EXT3:%.*]] = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f64(double [[B_IMAG]], metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP4:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[EXT]], x86_fp80 [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP5:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[EXT1]], x86_fp80 [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP6:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[TMP4]], x86_fp80 [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP7:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[EXT2]], x86_fp80 [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP8:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[EXT3]], x86_fp80 [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP9:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[TMP7]], x86_fp80 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP10:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[EXT1]], x86_fp80 [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP11:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[EXT]], x86_fp80 [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP12:%.*]] = call x86_fp80 @llvm.experimental.constrained.fsub.f80(x86_fp80 [[TMP10]], x86_fp80 [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP13:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP6]], x86_fp80 [[TMP9]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP14:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP12]], x86_fp80 [[TMP9]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[UNPROMOTION:%.*]] = call double @llvm.experimental.constrained.fptrunc.f64.f80(x86_fp80 [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[UNPROMOTION4:%.*]] = call double @llvm.experimental.constrained.fptrunc.f64.f80(x86_fp80 [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[RETVAL]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[RETVAL]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: store double [[UNPROMOTION]], ptr [[RETVAL_REALP]], align 8
+// PRMTD_STRICT-NEXT: store double [[UNPROMOTION4]], ptr [[RETVAL_IMAGP]], align 8
+// PRMTD_STRICT-NEXT: [[TMP15:%.*]] = load { double, double }, ptr [[RETVAL]], align 8
+// PRMTD_STRICT-NEXT: ret { double, double } [[TMP15]]
+//
// Complex double division under test. The X86WINPRMTD_STRICT checks above
// exercise the range-reduced (Smith-style) algorithm: fabs comparison of the
// divisor components, two branches (abs_rhsr_greater_or_equal_abs_rhsi /
// abs_rhsr_less_than_abs_rhsi), and phi-merged quotients — all via constrained
// intrinsics. The PRMTD_STRICT checks instead show promotion to x86_fp80.
// Do not restyle: the autogenerated CHECK lines match this exact IR.
_Complex double divd(_Complex double a, _Complex double b) {
  return a / b;
}
@@ -1834,6 +2087,78 @@ _Complex double divd(_Complex double a, _Complex double b) {
// PRMTD_FAST-NEXT: [[TMP4:%.*]] = load { double, double }, ptr [[RETVAL]], align 8
// PRMTD_FAST-NEXT: ret { double, double } [[TMP4]]
//
+// X86WINPRMTD_STRICT-LABEL: define dso_local void @muld(
+// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] {
+// X86WINPRMTD_STRICT-NEXT: entry:
+// X86WINPRMTD_STRICT-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 8
+// X86WINPRMTD_STRICT-NEXT: [[B_INDIRECT_ADDR:%.*]] = alloca ptr, align 8
+// X86WINPRMTD_STRICT-NEXT: [[A_INDIRECT_ADDR:%.*]] = alloca ptr, align 8
+// X86WINPRMTD_STRICT-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 8
+// X86WINPRMTD_STRICT-NEXT: store ptr [[B]], ptr [[B_INDIRECT_ADDR]], align 8
+// X86WINPRMTD_STRICT-NEXT: store ptr [[A]], ptr [[A_INDIRECT_ADDR]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load double, ptr [[A_IMAGP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[MUL_AC:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[MUL_BD:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[MUL_AD:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[MUL_BC:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[MUL_R:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[MUL_AC]], double [[MUL_BD]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[MUL_I:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[MUL_AD]], double [[MUL_BC]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: store double [[MUL_R]], ptr [[AGG_RESULT_REALP]], align 8
+// X86WINPRMTD_STRICT-NEXT: store double [[MUL_I]], ptr [[AGG_RESULT_IMAGP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP1:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REAL:%.*]] = load double, ptr [[AGG_RESULT_REALP1]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP2:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAG:%.*]] = load double, ptr [[AGG_RESULT_IMAGP2]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP3:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP4:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: store double [[AGG_RESULT_REAL]], ptr [[AGG_RESULT_REALP3]], align 8
+// X86WINPRMTD_STRICT-NEXT: store double [[AGG_RESULT_IMAG]], ptr [[AGG_RESULT_IMAGP4]], align 8
+// X86WINPRMTD_STRICT-NEXT: ret void
+//
+// PRMTD_STRICT-LABEL: define dso_local { double, double } @muld(
+// PRMTD_STRICT-SAME: double noundef [[A_COERCE0:%.*]], double noundef [[A_COERCE1:%.*]], double noundef [[B_COERCE0:%.*]], double noundef [[B_COERCE1:%.*]]) #[[ATTR2]] {
+// PRMTD_STRICT-NEXT: entry:
+// PRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { double, double }, align 8
+// PRMTD_STRICT-NEXT: [[A:%.*]] = alloca { double, double }, align 8
+// PRMTD_STRICT-NEXT: [[B:%.*]] = alloca { double, double }, align 8
+// PRMTD_STRICT-NEXT: [[TMP0:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: store double [[A_COERCE0]], ptr [[TMP0]], align 8
+// PRMTD_STRICT-NEXT: [[TMP1:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: store double [[A_COERCE1]], ptr [[TMP1]], align 8
+// PRMTD_STRICT-NEXT: [[TMP2:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: store double [[B_COERCE0]], ptr [[TMP2]], align 8
+// PRMTD_STRICT-NEXT: [[TMP3:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: store double [[B_COERCE1]], ptr [[TMP3]], align 8
+// PRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8
+// PRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load double, ptr [[A_IMAGP]], align 8
+// PRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8
+// PRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 8
+// PRMTD_STRICT-NEXT: [[MUL_AC:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[MUL_BD:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[MUL_AD:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[MUL_BC:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[MUL_R:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[MUL_AC]], double [[MUL_BD]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[MUL_I:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[MUL_AD]], double [[MUL_BC]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[RETVAL]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[RETVAL]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: store double [[MUL_R]], ptr [[RETVAL_REALP]], align 8
+// PRMTD_STRICT-NEXT: store double [[MUL_I]], ptr [[RETVAL_IMAGP]], align 8
+// PRMTD_STRICT-NEXT: [[TMP4:%.*]] = load { double, double }, ptr [[RETVAL]], align 8
+// PRMTD_STRICT-NEXT: ret { double, double } [[TMP4]]
+//
// Complex double multiplication under test. As with mulf, the CHECK blocks
// above pin the ac-bd / ad+bc expansion; the *_STRICT prefixes require each
// fmul/fsub/fadd to be a constrained intrinsic with round.dynamic and
// fpexcept.strict. Keep this body unchanged so the generated checks match.
_Complex double muld(_Complex double a, _Complex double b) {
  return a * b;
}
@@ -2316,6 +2641,114 @@ _Complex double muld(_Complex double a, _Complex double b) {
// PRMTD_FAST-NEXT: [[TMP22:%.*]] = load { x86_fp80, x86_fp80 }, ptr [[RETVAL]], align 16
// PRMTD_FAST-NEXT: ret { x86_fp80, x86_fp80 } [[TMP22]]
//
+// X86WINPRMTD_STRICT-LABEL: define dso_local void @divld(
+// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] {
+// X86WINPRMTD_STRICT-NEXT: entry:
+// X86WINPRMTD_STRICT-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 8
+// X86WINPRMTD_STRICT-NEXT: [[B_INDIRECT_ADDR:%.*]] = alloca ptr, align 8
+// X86WINPRMTD_STRICT-NEXT: [[A_INDIRECT_ADDR:%.*]] = alloca ptr, align 8
+// X86WINPRMTD_STRICT-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 8
+// X86WINPRMTD_STRICT-NEXT: store ptr [[B]], ptr [[B_INDIRECT_ADDR]], align 8
+// X86WINPRMTD_STRICT-NEXT: store ptr [[A]], ptr [[A_INDIRECT_ADDR]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load double, ptr [[A_IMAGP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[TMP0:%.*]] = call double @llvm.fabs.f64(double [[B_REAL]]) #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP1:%.*]] = call double @llvm.fabs.f64(double [[B_IMAG]]) #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[ABS_CMP:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[TMP0]], double [[TMP1]], metadata !"ugt", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: br i1 [[ABS_CMP]], label [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI:%.*]], label [[ABS_RHSR_LESS_THAN_ABS_RHSI:%.*]]
+// X86WINPRMTD_STRICT: abs_rhsr_greater_or_equal_abs_rhsi:
+// X86WINPRMTD_STRICT-NEXT: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[B_IMAG]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP3:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[TMP2]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP4:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[B_REAL]], double [[TMP3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP5:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[A_REAL]], double [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP7:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP6]], double [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP8:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP9:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[A_IMAG]], double [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP10:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP9]], double [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: br label [[COMPLEX_DIV:%.*]]
+// X86WINPRMTD_STRICT: abs_rhsr_less_than_abs_rhsi:
+// X86WINPRMTD_STRICT-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[B_REAL]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[TMP11]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[B_IMAG]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP14:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP14]], double [[A_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP16:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP15]], double [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP18:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP17]], double [[A_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP18]], double [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: br label [[COMPLEX_DIV]]
+// X86WINPRMTD_STRICT: complex_div:
+// X86WINPRMTD_STRICT-NEXT: [[TMP20:%.*]] = phi double [ [[TMP7]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP16]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ]
+// X86WINPRMTD_STRICT-NEXT: [[TMP21:%.*]] = phi double [ [[TMP10]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP19]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ]
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: store double [[TMP20]], ptr [[AGG_RESULT_REALP]], align 8
+// X86WINPRMTD_STRICT-NEXT: store double [[TMP21]], ptr [[AGG_RESULT_IMAGP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP1:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REAL:%.*]] = load double, ptr [[AGG_RESULT_REALP1]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP2:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAG:%.*]] = load double, ptr [[AGG_RESULT_IMAGP2]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP3:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP4:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: store double [[AGG_RESULT_REAL]], ptr [[AGG_RESULT_REALP3]], align 8
+// X86WINPRMTD_STRICT-NEXT: store double [[AGG_RESULT_IMAG]], ptr [[AGG_RESULT_IMAGP4]], align 8
+// X86WINPRMTD_STRICT-NEXT: ret void
+//
+// PRMTD_STRICT-LABEL: define dso_local { x86_fp80, x86_fp80 } @divld(
+// PRMTD_STRICT-SAME: ptr noundef byval({ x86_fp80, x86_fp80 }) align 16 [[A:%.*]], ptr noundef byval({ x86_fp80, x86_fp80 }) align 16 [[B:%.*]]) #[[ATTR2]] {
+// PRMTD_STRICT-NEXT: entry:
+// PRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { x86_fp80, x86_fp80 }, align 16
+// PRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[A]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load x86_fp80, ptr [[A_REALP]], align 16
+// PRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[A]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load x86_fp80, ptr [[A_IMAGP]], align 16
+// PRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[B]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load x86_fp80, ptr [[B_REALP]], align 16
+// PRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[B]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load x86_fp80, ptr [[B_IMAGP]], align 16
+// PRMTD_STRICT-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.fabs.f80(x86_fp80 [[B_REAL]]) #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP1:%.*]] = call x86_fp80 @llvm.fabs.f80(x86_fp80 [[B_IMAG]]) #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[ABS_CMP:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f80(x86_fp80 [[TMP0]], x86_fp80 [[TMP1]], metadata !"ugt", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: br i1 [[ABS_CMP]], label [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI:%.*]], label [[ABS_RHSR_LESS_THAN_ABS_RHSI:%.*]]
+// PRMTD_STRICT: abs_rhsr_greater_or_equal_abs_rhsi:
+// PRMTD_STRICT-NEXT: [[TMP2:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[B_IMAG]], x86_fp80 [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP3:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[TMP2]], x86_fp80 [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP4:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[B_REAL]], x86_fp80 [[TMP3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP5:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[A_IMAG]], x86_fp80 [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP6:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[A_REAL]], x86_fp80 [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP7:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP6]], x86_fp80 [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP8:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[A_REAL]], x86_fp80 [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP9:%.*]] = call x86_fp80 @llvm.experimental.constrained.fsub.f80(x86_fp80 [[A_IMAG]], x86_fp80 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP10:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP9]], x86_fp80 [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: br label [[COMPLEX_DIV:%.*]]
+// PRMTD_STRICT: abs_rhsr_less_than_abs_rhsi:
+// PRMTD_STRICT-NEXT: [[TMP11:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[B_REAL]], x86_fp80 [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP12:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[TMP11]], x86_fp80 [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP13:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[B_IMAG]], x86_fp80 [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP14:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[A_REAL]], x86_fp80 [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP15:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[TMP14]], x86_fp80 [[A_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP16:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP15]], x86_fp80 [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP17:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[A_IMAG]], x86_fp80 [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP18:%.*]] = call x86_fp80 @llvm.experimental.constrained.fsub.f80(x86_fp80 [[TMP17]], x86_fp80 [[A_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP19:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP18]], x86_fp80 [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: br label [[COMPLEX_DIV]]
+// PRMTD_STRICT: complex_div:
+// PRMTD_STRICT-NEXT: [[TMP20:%.*]] = phi x86_fp80 [ [[TMP7]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP16]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ]
+// PRMTD_STRICT-NEXT: [[TMP21:%.*]] = phi x86_fp80 [ [[TMP10]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP19]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ]
+// PRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[RETVAL]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[RETVAL]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: store x86_fp80 [[TMP20]], ptr [[RETVAL_REALP]], align 16
+// PRMTD_STRICT-NEXT: store x86_fp80 [[TMP21]], ptr [[RETVAL_IMAGP]], align 16
+// PRMTD_STRICT-NEXT: [[TMP22:%.*]] = load { x86_fp80, x86_fp80 }, ptr [[RETVAL]], align 16
+// PRMTD_STRICT-NEXT: ret { x86_fp80, x86_fp80 } [[TMP22]]
+//
// Divides two _Complex long double values. The CHECK lines above
// (X86WINPRMTD_STRICT / PRMTD_STRICT prefixes) pin the range-reducing
// division lowering: compare |b.re| vs |b.im| via llvm.fabs + a
// constrained fcmp "ugt", then take one of two scaled-quotient branches
// (abs_rhsr_greater_or_equal_abs_rhsi / abs_rhsr_less_than_abs_rhsi)
// joined by phis in complex_div — all arithmetic through
// @llvm.experimental.constrained.* intrinsics with fpexcept.strict.
// On the Windows target (X86WINPRMTD_STRICT) long double is double and
// the result is returned via an sret pointer; on Linux (PRMTD_STRICT)
// it is x86_fp80 returned by value.
// NOTE(review): CHECK lines are autogenerated (update_cc_test_checks.py);
// do not hand-edit this function without regenerating them.
_Complex long double divld(_Complex long double a, _Complex long double b) {
  return a / b;
}
@@ -2659,6 +3092,68 @@ _Complex long double divld(_Complex long double a, _Complex long double b) {
// PRMTD_FAST-NEXT: [[TMP0:%.*]] = load { x86_fp80, x86_fp80 }, ptr [[RETVAL]], align 16
// PRMTD_FAST-NEXT: ret { x86_fp80, x86_fp80 } [[TMP0]]
//
+// X86WINPRMTD_STRICT-LABEL: define dso_local void @mulld(
+// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] {
+// X86WINPRMTD_STRICT-NEXT: entry:
+// X86WINPRMTD_STRICT-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 8
+// X86WINPRMTD_STRICT-NEXT: [[B_INDIRECT_ADDR:%.*]] = alloca ptr, align 8
+// X86WINPRMTD_STRICT-NEXT: [[A_INDIRECT_ADDR:%.*]] = alloca ptr, align 8
+// X86WINPRMTD_STRICT-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 8
+// X86WINPRMTD_STRICT-NEXT: store ptr [[B]], ptr [[B_INDIRECT_ADDR]], align 8
+// X86WINPRMTD_STRICT-NEXT: store ptr [[A]], ptr [[A_INDIRECT_ADDR]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load double, ptr [[A_IMAGP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[MUL_AC:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[MUL_BD:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[MUL_AD:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[MUL_BC:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[MUL_R:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[MUL_AC]], double [[MUL_BD]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[MUL_I:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[MUL_AD]], double [[MUL_BC]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: store double [[MUL_R]], ptr [[AGG_RESULT_REALP]], align 8
+// X86WINPRMTD_STRICT-NEXT: store double [[MUL_I]], ptr [[AGG_RESULT_IMAGP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP1:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REAL:%.*]] = load double, ptr [[AGG_RESULT_REALP1]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP2:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAG:%.*]] = load double, ptr [[AGG_RESULT_IMAGP2]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP3:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP4:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: store double [[AGG_RESULT_REAL]], ptr [[AGG_RESULT_REALP3]], align 8
+// X86WINPRMTD_STRICT-NEXT: store double [[AGG_RESULT_IMAG]], ptr [[AGG_RESULT_IMAGP4]], align 8
+// X86WINPRMTD_STRICT-NEXT: ret void
+//
+// PRMTD_STRICT-LABEL: define dso_local { x86_fp80, x86_fp80 } @mulld(
+// PRMTD_STRICT-SAME: ptr noundef byval({ x86_fp80, x86_fp80 }) align 16 [[A:%.*]], ptr noundef byval({ x86_fp80, x86_fp80 }) align 16 [[B:%.*]]) #[[ATTR2]] {
+// PRMTD_STRICT-NEXT: entry:
+// PRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { x86_fp80, x86_fp80 }, align 16
+// PRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[A]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load x86_fp80, ptr [[A_REALP]], align 16
+// PRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[A]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load x86_fp80, ptr [[A_IMAGP]], align 16
+// PRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[B]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load x86_fp80, ptr [[B_REALP]], align 16
+// PRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[B]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load x86_fp80, ptr [[B_IMAGP]], align 16
+// PRMTD_STRICT-NEXT: [[MUL_AC:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[A_REAL]], x86_fp80 [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[MUL_BD:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[A_IMAG]], x86_fp80 [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[MUL_AD:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[A_REAL]], x86_fp80 [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[MUL_BC:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[A_IMAG]], x86_fp80 [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[MUL_R:%.*]] = call x86_fp80 @llvm.experimental.constrained.fsub.f80(x86_fp80 [[MUL_AC]], x86_fp80 [[MUL_BD]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[MUL_I:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[MUL_AD]], x86_fp80 [[MUL_BC]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[RETVAL]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[RETVAL]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: store x86_fp80 [[MUL_R]], ptr [[RETVAL_REALP]], align 16
+// PRMTD_STRICT-NEXT: store x86_fp80 [[MUL_I]], ptr [[RETVAL_IMAGP]], align 16
+// PRMTD_STRICT-NEXT: [[TMP0:%.*]] = load { x86_fp80, x86_fp80 }, ptr [[RETVAL]], align 16
+// PRMTD_STRICT-NEXT: ret { x86_fp80, x86_fp80 } [[TMP0]]
+//
// Multiplies two _Complex long double values. The CHECK lines above pin
// the strict constrained-FP multiply lowering (MUL_AC/MUL_BD/MUL_AD/
// MUL_BC partial products combined into MUL_R and MUL_I), mirroring
// muld but at x86_fp80 precision for PRMTD_STRICT and double (sret
// return) for the Windows X86WINPRMTD_STRICT target.
// NOTE(review): CHECK lines are autogenerated (update_cc_test_checks.py);
// do not hand-edit this function without regenerating them.
_Complex long double mulld(_Complex long double a, _Complex long double b) {
  return a * b;
}
@@ -3446,6 +3941,167 @@ _Complex long double mulld(_Complex long double a, _Complex long double b) {
// PRMTD_FAST-NEXT: [[TMP33:%.*]] = load <2 x float>, ptr [[RETVAL]], align 4
// PRMTD_FAST-NEXT: ret <2 x float> [[TMP33]]
//
+// X86WINPRMTD_STRICT-LABEL: define dso_local i64 @f1(
+// X86WINPRMTD_STRICT-SAME: i64 noundef [[A_COERCE:%.*]], ptr noundef [[B:%.*]], i64 noundef [[C_COERCE:%.*]]) #[[ATTR0]] {
+// X86WINPRMTD_STRICT-NEXT: entry:
+// X86WINPRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { float, float }, align 4
+// X86WINPRMTD_STRICT-NEXT: [[A:%.*]] = alloca { float, float }, align 4
+// X86WINPRMTD_STRICT-NEXT: [[C:%.*]] = alloca { float, float }, align 4
+// X86WINPRMTD_STRICT-NEXT: [[B_INDIRECT_ADDR:%.*]] = alloca ptr, align 8
+// X86WINPRMTD_STRICT-NEXT: store i64 [[A_COERCE]], ptr [[A]], align 4
+// X86WINPRMTD_STRICT-NEXT: store i64 [[C_COERCE]], ptr [[C]], align 4
+// X86WINPRMTD_STRICT-NEXT: store ptr [[B]], ptr [[B_INDIRECT_ADDR]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 8
+// X86WINPRMTD_STRICT-NEXT: [[C_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[C]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[C_REAL:%.*]] = load float, ptr [[C_REALP]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[C_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[C]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[C_IMAG:%.*]] = load float, ptr [[C_IMAGP]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[CONV:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[C_REAL]], metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[CONV1:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[C_IMAG]], metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP0:%.*]] = call double @llvm.fabs.f64(double [[CONV]]) #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP1:%.*]] = call double @llvm.fabs.f64(double [[CONV1]]) #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[ABS_CMP:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[TMP0]], double [[TMP1]], metadata !"ugt", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: br i1 [[ABS_CMP]], label [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI:%.*]], label [[ABS_RHSR_LESS_THAN_ABS_RHSI:%.*]]
+// X86WINPRMTD_STRICT: abs_rhsr_greater_or_equal_abs_rhsi:
+// X86WINPRMTD_STRICT-NEXT: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[CONV1]], double [[CONV]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP3:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[TMP2]], double [[CONV1]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP4:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[CONV]], double [[TMP3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP5:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[B_IMAG]], double [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[B_REAL]], double [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP7:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP6]], double [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP8:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[B_REAL]], double [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP9:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[B_IMAG]], double [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP10:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP9]], double [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: br label [[COMPLEX_DIV:%.*]]
+// X86WINPRMTD_STRICT: abs_rhsr_less_than_abs_rhsi:
+// X86WINPRMTD_STRICT-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[CONV]], double [[CONV1]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[TMP11]], double [[CONV]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[CONV1]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP14:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[B_REAL]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP14]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP16:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP15]], double [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[B_IMAG]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP18:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP17]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP18]], double [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: br label [[COMPLEX_DIV]]
+// X86WINPRMTD_STRICT: complex_div:
+// X86WINPRMTD_STRICT-NEXT: [[TMP20:%.*]] = phi double [ [[TMP7]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP16]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ]
+// X86WINPRMTD_STRICT-NEXT: [[TMP21:%.*]] = phi double [ [[TMP10]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP19]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ]
+// X86WINPRMTD_STRICT-NEXT: [[CONV2:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[CONV3:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[EXT:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[CONV2]], metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[EXT4:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[CONV3]], metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[EXT5:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[A_REAL]], metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[EXT6:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[A_IMAG]], metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP22:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT]], double [[EXT5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP23:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT4]], double [[EXT6]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP24:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP22]], double [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP25:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT5]], double [[EXT5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP26:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT6]], double [[EXT6]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP25]], double [[TMP26]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT4]], double [[EXT5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT]], double [[EXT6]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP30:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP28]], double [[TMP29]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP31:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP24]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP30]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[UNPROMOTION:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[UNPROMOTION7:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP32]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]]
+// X86WINPRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 0
+// X86WINPRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 1
+// X86WINPRMTD_STRICT-NEXT: store float [[UNPROMOTION]], ptr [[RETVAL_REALP]], align 4
+// X86WINPRMTD_STRICT-NEXT: store float [[UNPROMOTION7]], ptr [[RETVAL_IMAGP]], align 4
+// X86WINPRMTD_STRICT-NEXT: [[TMP33:%.*]] = load i64, ptr [[RETVAL]], align 4
+// X86WINPRMTD_STRICT-NEXT: ret i64 [[TMP33]]
+//
+// PRMTD_STRICT-LABEL: define dso_local <2 x float> @f1(
+// PRMTD_STRICT-SAME: <2 x float> noundef [[A_COERCE:%.*]], ptr noundef byval({ x86_fp80, x86_fp80 }) align 16 [[B:%.*]], <2 x float> noundef [[C_COERCE:%.*]]) #[[ATTR0]] {
+// PRMTD_STRICT-NEXT: entry:
+// PRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { float, float }, align 4
+// PRMTD_STRICT-NEXT: [[A:%.*]] = alloca { float, float }, align 4
+// PRMTD_STRICT-NEXT: [[C:%.*]] = alloca { float, float }, align 4
+// PRMTD_STRICT-NEXT: store <2 x float> [[A_COERCE]], ptr [[A]], align 4
+// PRMTD_STRICT-NEXT: store <2 x float> [[C_COERCE]], ptr [[C]], align 4
+// PRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[B]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load x86_fp80, ptr [[B_REALP]], align 16
+// PRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[B]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load x86_fp80, ptr [[B_IMAGP]], align 16
+// PRMTD_STRICT-NEXT: [[C_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[C]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[C_REAL:%.*]] = load float, ptr [[C_REALP]], align 4
+// PRMTD_STRICT-NEXT: [[C_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[C]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: [[C_IMAG:%.*]] = load float, ptr [[C_IMAGP]], align 4
+// PRMTD_STRICT-NEXT: [[CONV:%.*]] = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f32(float [[C_REAL]], metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[CONV1:%.*]] = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f32(float [[C_IMAG]], metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.fabs.f80(x86_fp80 [[CONV]]) #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP1:%.*]] = call x86_fp80 @llvm.fabs.f80(x86_fp80 [[CONV1]]) #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[ABS_CMP:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f80(x86_fp80 [[TMP0]], x86_fp80 [[TMP1]], metadata !"ugt", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: br i1 [[ABS_CMP]], label [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI:%.*]], label [[ABS_RHSR_LESS_THAN_ABS_RHSI:%.*]]
+// PRMTD_STRICT: abs_rhsr_greater_or_equal_abs_rhsi:
+// PRMTD_STRICT-NEXT: [[TMP2:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[CONV1]], x86_fp80 [[CONV]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP3:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[TMP2]], x86_fp80 [[CONV1]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP4:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[CONV]], x86_fp80 [[TMP3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP5:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[B_IMAG]], x86_fp80 [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP6:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[B_REAL]], x86_fp80 [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP7:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP6]], x86_fp80 [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP8:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[B_REAL]], x86_fp80 [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP9:%.*]] = call x86_fp80 @llvm.experimental.constrained.fsub.f80(x86_fp80 [[B_IMAG]], x86_fp80 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP10:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP9]], x86_fp80 [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: br label [[COMPLEX_DIV:%.*]]
+// PRMTD_STRICT: abs_rhsr_less_than_abs_rhsi:
+// PRMTD_STRICT-NEXT: [[TMP11:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[CONV]], x86_fp80 [[CONV1]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP12:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[TMP11]], x86_fp80 [[CONV]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP13:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[CONV1]], x86_fp80 [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP14:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[B_REAL]], x86_fp80 [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP15:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[TMP14]], x86_fp80 [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP16:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP15]], x86_fp80 [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP17:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[B_IMAG]], x86_fp80 [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP18:%.*]] = call x86_fp80 @llvm.experimental.constrained.fsub.f80(x86_fp80 [[TMP17]], x86_fp80 [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP19:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP18]], x86_fp80 [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: br label [[COMPLEX_DIV]]
+// PRMTD_STRICT: complex_div:
+// PRMTD_STRICT-NEXT: [[TMP20:%.*]] = phi x86_fp80 [ [[TMP7]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP16]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ]
+// PRMTD_STRICT-NEXT: [[TMP21:%.*]] = phi x86_fp80 [ [[TMP10]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP19]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ]
+// PRMTD_STRICT-NEXT: [[CONV2:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f80(x86_fp80 [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[CONV3:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f80(x86_fp80 [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[EXT:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[CONV2]], metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[EXT4:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[CONV3]], metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 4
+// PRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 4
+// PRMTD_STRICT-NEXT: [[EXT5:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[A_REAL]], metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[EXT6:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[A_IMAG]], metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP22:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT]], double [[EXT5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP23:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT4]], double [[EXT6]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP24:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP22]], double [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP25:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT5]], double [[EXT5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP26:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT6]], double [[EXT6]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP25]], double [[TMP26]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT4]], double [[EXT5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT]], double [[EXT6]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP30:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP28]], double [[TMP29]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP31:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP24]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP30]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[UNPROMOTION:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[UNPROMOTION7:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP32]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// PRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 0
+// PRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 1
+// PRMTD_STRICT-NEXT: store float [[UNPROMOTION]], ptr [[RETVAL_REALP]], align 4
+// PRMTD_STRICT-NEXT: store float [[UNPROMOTION7]], ptr [[RETVAL_IMAGP]], align 4
+// PRMTD_STRICT-NEXT: [[TMP33:%.*]] = load <2 x float>, ptr [[RETVAL]], align 4
+// PRMTD_STRICT-NEXT: ret <2 x float> [[TMP33]]
+//
_Complex float f1(_Complex float a, _Complex long double b, _Complex float c) {
return (_Complex float)(b / c) / a;
}
+//.
+// FULL: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
+//.
+// FULL_FAST: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575}
+//.
diff --git a/clang/test/CodeGen/remote-traps.c b/clang/test/CodeGen/remote-traps.c
deleted file mode 100644
index 6751afb96d25..000000000000
--- a/clang/test/CodeGen/remote-traps.c
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: %clang_cc1 -O1 -emit-llvm -fsanitize=signed-integer-overflow -fsanitize-trap=signed-integer-overflow %s -o - | FileCheck %s
-// RUN: %clang_cc1 -O1 -emit-llvm -fsanitize=signed-integer-overflow -fsanitize-trap=signed-integer-overflow -mllvm -clang-remove-traps -mllvm -remove-traps-random-rate=1 %s -o - | FileCheck %s --implicit-check-not="call void @llvm.ubsantrap" --check-prefixes=REMOVE
-
-int test(int x) {
- return x + 123;
-}
-
-// CHECK-LABEL: define {{.*}}i32 @test(
-// CHECK: call { i32, i1 } @llvm.sadd.with.overflow.i32(
-// CHECK: trap:
-// CHECK-NEXT: call void @llvm.ubsantrap(i8 0)
-// CHECK-NEXT: unreachable
-
-// REMOVE-LABEL: define {{.*}}i32 @test(
-// REMOVE: call { i32, i1 } @llvm.sadd.with.overflow.i32(
diff --git a/clang/test/CodeGen/tbaa-struct-bitfield-endianness.cpp b/clang/test/CodeGen/tbaa-struct-bitfield-endianness.cpp
new file mode 100644
index 000000000000..80884b49ddc6
--- /dev/null
+++ b/clang/test/CodeGen/tbaa-struct-bitfield-endianness.cpp
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -triple aarch64_be-apple-darwin -emit-llvm -o - -O1 %s | \
+// RUN: FileCheck -check-prefixes=CHECK,CHECK-BE %s
+// RUN: %clang_cc1 -triple aarch64-apple-darwin -emit-llvm -o - -O1 %s | \
+// RUN: FileCheck -check-prefixes=CHECK,CHECK-LE %s
+//
+// Check that TBAA metadata for structs containing bitfields is
+// consistent between big and little endian layouts.
+//
+// FIXME: The metadata below is invalid for the big endian layout: the
+// start offset of 2 is incorrect.
+
+struct NamedBitfields {
+ int f1 : 8;
+ int f2 : 8;
+ unsigned f3 : 1;
+ unsigned f4 : 15;
+ int f5;
+ double f6;
+};
+
+// CHECK-LABEL: _Z4copyP14NamedBitfieldsS0_
+// CHECK-SAME: ptr nocapture noundef writeonly [[A1:%.*]], ptr nocapture noundef readonly [[A2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[A1]], ptr noundef nonnull align 8 dereferenceable(16) [[A2]], i64 16, i1 false), !tbaa.struct [[TBAA_STRUCT2:![0-9]+]]
+// CHECK-NEXT: ret void
+//
+void copy(NamedBitfields *a1, NamedBitfields *a2) {
+ *a1 = *a2;
+}
+
+// CHECK-BE: [[TBAA_STRUCT2]] = !{i64 2, i64 4, [[META3:![0-9]+]], i64 4, i64 4, [[META6:![0-9]+]], i64 8, i64 8, [[META8:![0-9]+]]}
+// CHECK-LE: [[TBAA_STRUCT2]] = !{i64 0, i64 4, [[META3:![0-9]+]], i64 4, i64 4, [[META6:![0-9]+]], i64 8, i64 8, [[META8:![0-9]+]]}
+// CHECK: [[META3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
+// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// CHECK: [[META5]] = !{!"Simple C++ TBAA"}
+// CHECK: [[META6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
+// CHECK: [[META7]] = !{!"int", [[META4]], i64 0}
+// CHECK: [[META8]] = !{[[META9:![0-9]+]], [[META9]], i64 0}
+// CHECK: [[META9]] = !{!"double", [[META4]], i64 0}
diff --git a/clang/test/Driver/mcmodel.c b/clang/test/Driver/mcmodel.c
index 1eb6ae16ff47..9681c32579d7 100644
--- a/clang/test/Driver/mcmodel.c
+++ b/clang/test/Driver/mcmodel.c
@@ -11,6 +11,7 @@
// RUN: FileCheck --check-prefix=AIX-MCMEDIUM-OVERRIDE %s < %t.log
// RUN: not %clang -### -c -mcmodel=lager %s 2>&1 | FileCheck --check-prefix=INVALID %s
// RUN: %clang --target=aarch64 -### -S -mcmodel=large -fno-pic %s 2>&1 | FileCheck --check-prefix=LARGE %s
+// RUN: %clang --target=aarch64-apple-macosx -### -S -mcmodel=large %s 2>&1 | FileCheck --check-prefix=LARGE %s
// RUN: not %clang --target=aarch64 -### -S -mcmodel=large -fpic %s 2>&1 | FileCheck --check-prefix=AARCH64-PIC-LARGE %s
// RUN: not %clang -### -c --target=aarch64 -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=ERR-MEDIUM %s
// RUN: not %clang -### -c --target=aarch64 -mcmodel=kernel %s 2>&1 | FileCheck --check-prefix=ERR-KERNEL %s
diff --git a/clang/test/InstallAPI/driver-invalid-options.test b/clang/test/InstallAPI/driver-invalid-options.test
index 69f3b2d66ab8..0c630eacd183 100644
--- a/clang/test/InstallAPI/driver-invalid-options.test
+++ b/clang/test/InstallAPI/driver-invalid-options.test
@@ -7,3 +7,9 @@
// RUN: not clang-installapi -target x86_64-apple-ios-simulator %s -o tmp.tbd 2> %t
// RUN: FileCheck --check-prefix INVALID_INSTALL_NAME -input-file %t %s
// INVALID_INSTALL_NAME: error: no install name specified: add -install_name <path>
+
+/// Check invalid verification mode.
+// RUN: not clang-installapi -install_name Foo -target arm64-apple-ios13 \
+// RUN: --verify-mode=Invalid -o tmp.tbd 2> %t
+// RUN: FileCheck --check-prefix INVALID_VERIFY_MODE -input-file %t %s
+// INVALID_VERIFY_MODE: error: invalid value 'Invalid' in '--verify-mode=Invalid'
diff --git a/clang/test/Interpreter/inline-asm.cpp b/clang/test/Interpreter/inline-asm.cpp
new file mode 100644
index 000000000000..f94f14df72f8
--- /dev/null
+++ b/clang/test/Interpreter/inline-asm.cpp
@@ -0,0 +1,17 @@
+// REQUIRES: host-supports-jit, x86_64-linux
+// UNSUPPORTED: system-aix
+//
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: cat %t/inline-asm.txt | clang-repl -Xcc="-I%t"
+
+//--- inline-asm.cpp
+__asm(".globl _ZSt21ios_base_library_initv");
+int x;
+
+//--- inline-asm.txt
+#include "inline-asm.cpp"
+x = 10;
+%quit
diff --git a/clang/tools/clang-repl/ClangRepl.cpp b/clang/tools/clang-repl/ClangRepl.cpp
index 5bad8145324d..aecf61b97fc7 100644
--- a/clang/tools/clang-repl/ClangRepl.cpp
+++ b/clang/tools/clang-repl/ClangRepl.cpp
@@ -152,6 +152,7 @@ int main(int argc, const char **argv) {
llvm::InitializeAllTargets();
llvm::InitializeAllTargetMCs();
llvm::InitializeAllAsmPrinters();
+ llvm::InitializeAllAsmParsers();
if (OptHostSupportsJit) {
auto J = llvm::orc::LLJITBuilder().create();
diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
index bea00ab1a1f0..b0b579d2bc19 100644
--- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
@@ -805,6 +805,25 @@ public:
else
JoinedVal.setProperty("is_null", JoinedEnv.makeTopBoolValue());
}
+
+ std::optional<WidenResult> widen(QualType Type, Value &Prev,
+ const Environment &PrevEnv, Value &Current,
+ Environment &CurrentEnv) override {
+ switch (compare(Type, Prev, PrevEnv, Current, CurrentEnv)) {
+ case ComparisonResult::Same:
+ return WidenResult{&Current, LatticeJoinEffect::Unchanged};
+ case ComparisonResult::Different: {
+ auto &CurPtr = cast<PointerValue>(Current);
+ auto &WidenedPtr =
+ CurrentEnv.create<PointerValue>(CurPtr.getPointeeLoc());
+ WidenedPtr.setProperty("is_null", CurrentEnv.makeTopBoolValue());
+ return WidenResult{&WidenedPtr, LatticeJoinEffect::Changed};
+ }
+ case ComparisonResult::Unknown:
+ return std::nullopt;
+ }
+ llvm_unreachable("all cases in switch covered");
+ }
};
class WideningTest : public Test {
@@ -846,7 +865,6 @@ TEST_F(WideningTest, JoinDistinctValuesWithDistinctProperties) {
Code,
[](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
ASTContext &ASTCtx) {
- ASSERT_THAT(Results.keys(), UnorderedElementsAre("p1", "p2", "p3"));
const Environment &Env1 = getEnvironmentAtAnnotation(Results, "p1");
const Environment &Env2 = getEnvironmentAtAnnotation(Results, "p2");
const Environment &Env3 = getEnvironmentAtAnnotation(Results, "p3");
@@ -889,8 +907,6 @@ TEST_F(WideningTest, JoinDistinctValuesWithSameProperties) {
Code,
[](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
ASTContext &ASTCtx) {
- ASSERT_THAT(Results.keys(),
- UnorderedElementsAre("p1", "p2", "p3", "p4"));
const Environment &Env1 = getEnvironmentAtAnnotation(Results, "p1");
const Environment &Env2 = getEnvironmentAtAnnotation(Results, "p2");
const Environment &Env3 = getEnvironmentAtAnnotation(Results, "p3");
@@ -929,19 +945,11 @@ TEST_F(WideningTest, DistinctPointersToTheSameLocationAreEquivalent) {
Code,
[](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
ASTContext &ASTCtx) {
- ASSERT_THAT(Results.keys(), UnorderedElementsAre("p"));
const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
-
- const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
- ASSERT_THAT(FooDecl, NotNull());
-
- const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
- ASSERT_THAT(BarDecl, NotNull());
-
- const auto *FooLoc =
- cast<ScalarStorageLocation>(Env.getStorageLocation(*FooDecl));
- const auto *BarVal = cast<PointerValue>(Env.getValue(*BarDecl));
- EXPECT_EQ(&BarVal->getPointeeLoc(), FooLoc);
+ const auto &FooLoc =
+ getLocForDecl<ScalarStorageLocation>(ASTCtx, Env, "Foo");
+ const auto &BarVal = getValueForDecl<PointerValue>(ASTCtx, Env, "Bar");
+ EXPECT_EQ(&BarVal.getPointeeLoc(), &FooLoc);
});
}
@@ -963,18 +971,38 @@ TEST_F(WideningTest, DistinctValuesWithSamePropertiesAreEquivalent) {
Code,
[](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
ASTContext &ASTCtx) {
- ASSERT_THAT(Results.keys(), UnorderedElementsAre("p"));
const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
-
- const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
- ASSERT_THAT(FooDecl, NotNull());
-
- const auto *FooVal = Env.getValue(*FooDecl);
- EXPECT_EQ(FooVal->getProperty("is_null"),
+ const auto &FooVal = getValueForDecl<Value>(ASTCtx, Env, "Foo");
+ EXPECT_EQ(FooVal.getProperty("is_null"),
&Env.getBoolLiteralValue(false));
});
}
+TEST_F(WideningTest, DistinctValuesWithDifferentPropertiesWidenedToTop) {
+ std::string Code = R"(
+ void target(bool Cond) {
+ int *Foo;
+ int i = 0;
+ Foo = nullptr;
+ while (Cond) {
+ Foo = &i;
+ }
+ (void)0;
+ /*[[p]]*/
+ }
+ )";
+ runDataflow(
+ Code,
+ [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+ ASTContext &ASTCtx) {
+ const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
+ const auto &FooVal = getValueForDecl<Value>(ASTCtx, Env, "Foo");
+ ASSERT_THAT(FooVal.getProperty("is_null"), NotNull());
+ EXPECT_TRUE(areEquivalentValues(*FooVal.getProperty("is_null"),
+ Env.makeTopBoolValue()));
+ });
+}
+
class FlowConditionTest : public Test {
protected:
template <typename Matcher>
diff --git a/compiler-rt/lib/hwasan/hwasan_thread_list.h b/compiler-rt/lib/hwasan/hwasan_thread_list.h
index d0eebd1b373a..369a1c3d6f5f 100644
--- a/compiler-rt/lib/hwasan/hwasan_thread_list.h
+++ b/compiler-rt/lib/hwasan/hwasan_thread_list.h
@@ -18,7 +18,7 @@
// * Start of the shadow memory region is aligned to 2**kShadowBaseAlignment.
// * All stack ring buffers are located within (2**kShadowBaseAlignment)
// sized region below and adjacent to the shadow region.
-// * Each ring buffer has a size of (2**N)*4096 where N is in [0, 8), and is
+// * Each ring buffer has a size of (2**N)*4096 where N is in [0, 7), and is
// aligned to twice its size. The value of N can be different for each buffer.
//
// These constrains guarantee that, given an address A of any element of the
@@ -55,7 +55,10 @@ static uptr RingBufferSize() {
uptr desired_bytes = flags()->stack_history_size * sizeof(uptr);
// FIXME: increase the limit to 8 once this bug is fixed:
// https://bugs.llvm.org/show_bug.cgi?id=39030
- for (int shift = 1; shift < 7; ++shift) {
+ // Note that we *cannot* do that on Android, as the runtime will indefinitely
+ // have to support code that is compiled with ashr, which only works with
+ // shifts up to 6.
+ for (int shift = 0; shift < 7; ++shift) {
uptr size = 4096 * (1ULL << shift);
if (size >= desired_bytes)
return size;
diff --git a/flang/.gitignore b/flang/.gitignore
index 4da4ee1178ba..508e70c1e1ea 100644
--- a/flang/.gitignore
+++ b/flang/.gitignore
@@ -5,7 +5,6 @@ build
root
tags
TAGS
-*.o
.nfs*
*.sw?
*~
diff --git a/flang/include/flang/Common/windows-include.h b/flang/include/flang/Common/windows-include.h
new file mode 100644
index 000000000000..75ef4974251f
--- /dev/null
+++ b/flang/include/flang/Common/windows-include.h
@@ -0,0 +1,25 @@
+//===-- include/flang/Common/windows-include.h ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Wrapper around windows.h that works around the name conflicts.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_COMMON_WINDOWS_INCLUDE_H_
+#define FORTRAN_COMMON_WINDOWS_INCLUDE_H_
+
+#ifdef _WIN32
+
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+
+#include <windows.h>
+
+#endif // _WIN32
+
+#endif // FORTRAN_COMMON_WINDOWS_INCLUDE_H_
diff --git a/flang/include/flang/Runtime/io-api.h b/flang/include/flang/Runtime/io-api.h
index 328afc715a3f..1b6c4f5d6a65 100644
--- a/flang/include/flang/Runtime/io-api.h
+++ b/flang/include/flang/Runtime/io-api.h
@@ -92,18 +92,18 @@ constexpr std::size_t RecommendedInternalIoScratchAreaBytes(
// Internal I/O to/from character arrays &/or non-default-kind character
// requires a descriptor, which is copied.
-Cookie IODECL(BeginInternalArrayListOutput)(const Descriptor &,
+Cookie IONAME(BeginInternalArrayListOutput)(const Descriptor &,
void **scratchArea = nullptr, std::size_t scratchBytes = 0,
const char *sourceFile = nullptr, int sourceLine = 0);
-Cookie IODECL(BeginInternalArrayListInput)(const Descriptor &,
+Cookie IONAME(BeginInternalArrayListInput)(const Descriptor &,
void **scratchArea = nullptr, std::size_t scratchBytes = 0,
const char *sourceFile = nullptr, int sourceLine = 0);
-Cookie IODECL(BeginInternalArrayFormattedOutput)(const Descriptor &,
+Cookie IONAME(BeginInternalArrayFormattedOutput)(const Descriptor &,
const char *format, std::size_t formatLength,
const Descriptor *formatDescriptor = nullptr, void **scratchArea = nullptr,
std::size_t scratchBytes = 0, const char *sourceFile = nullptr,
int sourceLine = 0);
-Cookie IODECL(BeginInternalArrayFormattedInput)(const Descriptor &,
+Cookie IONAME(BeginInternalArrayFormattedInput)(const Descriptor &,
const char *format, std::size_t formatLength,
const Descriptor *formatDescriptor = nullptr, void **scratchArea = nullptr,
std::size_t scratchBytes = 0, const char *sourceFile = nullptr,
@@ -111,20 +111,20 @@ Cookie IODECL(BeginInternalArrayFormattedInput)(const Descriptor &,
// Internal I/O to/from a default-kind character scalar can avoid a
// descriptor.
-Cookie IODECL(BeginInternalListOutput)(char *internal,
+Cookie IONAME(BeginInternalListOutput)(char *internal,
std::size_t internalLength, void **scratchArea = nullptr,
std::size_t scratchBytes = 0, const char *sourceFile = nullptr,
int sourceLine = 0);
-Cookie IODECL(BeginInternalListInput)(const char *internal,
+Cookie IONAME(BeginInternalListInput)(const char *internal,
std::size_t internalLength, void **scratchArea = nullptr,
std::size_t scratchBytes = 0, const char *sourceFile = nullptr,
int sourceLine = 0);
-Cookie IODECL(BeginInternalFormattedOutput)(char *internal,
+Cookie IONAME(BeginInternalFormattedOutput)(char *internal,
std::size_t internalLength, const char *format, std::size_t formatLength,
const Descriptor *formatDescriptor = nullptr, void **scratchArea = nullptr,
std::size_t scratchBytes = 0, const char *sourceFile = nullptr,
int sourceLine = 0);
-Cookie IODECL(BeginInternalFormattedInput)(const char *internal,
+Cookie IONAME(BeginInternalFormattedInput)(const char *internal,
std::size_t internalLength, const char *format, std::size_t formatLength,
const Descriptor *formatDescriptor = nullptr, void **scratchArea = nullptr,
std::size_t scratchBytes = 0, const char *sourceFile = nullptr,
@@ -139,63 +139,63 @@ Cookie IODECL(BeginInternalFormattedInput)(const char *internal,
// If handleError is false, and the unit number is out of range, the program
// will be terminated. Otherwise, if unit is out of range, a nonzero Iostat
// code is returned and ioMsg is set if it is not a nullptr.
-enum Iostat IODECL(CheckUnitNumberInRange64)(std::int64_t unit,
+enum Iostat IONAME(CheckUnitNumberInRange64)(std::int64_t unit,
bool handleError, char *ioMsg = nullptr, std::size_t ioMsgLength = 0,
const char *sourceFile = nullptr, int sourceLine = 0);
-enum Iostat IODECL(CheckUnitNumberInRange128)(common::int128_t unit,
+enum Iostat IONAME(CheckUnitNumberInRange128)(common::int128_t unit,
bool handleError, char *ioMsg = nullptr, std::size_t ioMsgLength = 0,
const char *sourceFile = nullptr, int sourceLine = 0);
// External synchronous I/O initiation
Cookie IODECL(BeginExternalListOutput)(ExternalUnit = DefaultOutputUnit,
const char *sourceFile = nullptr, int sourceLine = 0);
-Cookie IODECL(BeginExternalListInput)(ExternalUnit = DefaultInputUnit,
+Cookie IONAME(BeginExternalListInput)(ExternalUnit = DefaultInputUnit,
const char *sourceFile = nullptr, int sourceLine = 0);
-Cookie IODECL(BeginExternalFormattedOutput)(const char *format, std::size_t,
+Cookie IONAME(BeginExternalFormattedOutput)(const char *format, std::size_t,
const Descriptor *formatDescriptor = nullptr,
ExternalUnit = DefaultOutputUnit, const char *sourceFile = nullptr,
int sourceLine = 0);
-Cookie IODECL(BeginExternalFormattedInput)(const char *format, std::size_t,
+Cookie IONAME(BeginExternalFormattedInput)(const char *format, std::size_t,
const Descriptor *formatDescriptor = nullptr,
ExternalUnit = DefaultInputUnit, const char *sourceFile = nullptr,
int sourceLine = 0);
-Cookie IODECL(BeginUnformattedOutput)(ExternalUnit = DefaultOutputUnit,
+Cookie IONAME(BeginUnformattedOutput)(ExternalUnit = DefaultOutputUnit,
const char *sourceFile = nullptr, int sourceLine = 0);
-Cookie IODECL(BeginUnformattedInput)(ExternalUnit = DefaultInputUnit,
+Cookie IONAME(BeginUnformattedInput)(ExternalUnit = DefaultInputUnit,
const char *sourceFile = nullptr, int sourceLine = 0);
// WAIT(ID=)
-Cookie IODECL(BeginWait)(ExternalUnit, AsynchronousId,
+Cookie IONAME(BeginWait)(ExternalUnit, AsynchronousId,
const char *sourceFile = nullptr, int sourceLine = 0);
// WAIT(no ID=)
-Cookie IODECL(BeginWaitAll)(
+Cookie IONAME(BeginWaitAll)(
ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0);
// Other I/O statements
-Cookie IODECL(BeginClose)(
+Cookie IONAME(BeginClose)(
ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0);
-Cookie IODECL(BeginFlush)(
+Cookie IONAME(BeginFlush)(
ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0);
-Cookie IODECL(BeginBackspace)(
+Cookie IONAME(BeginBackspace)(
ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0);
-Cookie IODECL(BeginEndfile)(
+Cookie IONAME(BeginEndfile)(
ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0);
-Cookie IODECL(BeginRewind)(
+Cookie IONAME(BeginRewind)(
ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0);
// OPEN(UNIT=) and OPEN(NEWUNIT=) have distinct interfaces.
-Cookie IODECL(BeginOpenUnit)(
+Cookie IONAME(BeginOpenUnit)(
ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0);
-Cookie IODECL(BeginOpenNewUnit)(
+Cookie IONAME(BeginOpenNewUnit)(
const char *sourceFile = nullptr, int sourceLine = 0);
// The variant forms of INQUIRE() statements have distinct interfaces.
// BeginInquireIoLength() is basically a no-op output statement.
-Cookie IODECL(BeginInquireUnit)(
+Cookie IONAME(BeginInquireUnit)(
ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0);
-Cookie IODECL(BeginInquireFile)(const char *, std::size_t,
+Cookie IONAME(BeginInquireFile)(const char *, std::size_t,
const char *sourceFile = nullptr, int sourceLine = 0);
-Cookie IODECL(BeginInquireIoLength)(
+Cookie IONAME(BeginInquireIoLength)(
const char *sourceFile = nullptr, int sourceLine = 0);
// If an I/O statement has any IOSTAT=, ERR=, END=, or EOR= specifiers,
@@ -214,33 +214,33 @@ Cookie IODECL(BeginInquireIoLength)(
// }
// }
// if (EndIoStatement(cookie) == FORTRAN_RUTIME_IOSTAT_END) goto label666;
-void IODECL(EnableHandlers)(Cookie, bool hasIoStat = false, bool hasErr = false,
+void IONAME(EnableHandlers)(Cookie, bool hasIoStat = false, bool hasErr = false,
bool hasEnd = false, bool hasEor = false, bool hasIoMsg = false);
// ASYNCHRONOUS='YES' or 'NO' on READ/WRITE/OPEN
// Use GetAsynchronousId() to handle ID=.
-bool IODECL(SetAsynchronous)(Cookie, const char *, std::size_t);
+bool IONAME(SetAsynchronous)(Cookie, const char *, std::size_t);
// Control list options. These return false on an error that the
// Begin...() call has specified will be handled by the caller.
// The interfaces that pass a default-kind CHARACTER argument
// are limited to passing specific case-insensitive keyword values.
// ADVANCE=YES, NO
-bool IODECL(SetAdvance)(Cookie, const char *, std::size_t);
+bool IONAME(SetAdvance)(Cookie, const char *, std::size_t);
// BLANK=NULL, ZERO
-bool IODECL(SetBlank)(Cookie, const char *, std::size_t);
+bool IONAME(SetBlank)(Cookie, const char *, std::size_t);
// DECIMAL=COMMA, POINT
-bool IODECL(SetDecimal)(Cookie, const char *, std::size_t);
+bool IONAME(SetDecimal)(Cookie, const char *, std::size_t);
// DELIM=APOSTROPHE, QUOTE, NONE
-bool IODECL(SetDelim)(Cookie, const char *, std::size_t);
+bool IONAME(SetDelim)(Cookie, const char *, std::size_t);
// PAD=YES, NO
-bool IODECL(SetPad)(Cookie, const char *, std::size_t);
-bool IODECL(SetPos)(Cookie, std::int64_t);
-bool IODECL(SetRec)(Cookie, std::int64_t);
+bool IONAME(SetPad)(Cookie, const char *, std::size_t);
+bool IONAME(SetPos)(Cookie, std::int64_t);
+bool IONAME(SetRec)(Cookie, std::int64_t);
// ROUND=UP, DOWN, ZERO, NEAREST, COMPATIBLE, PROCESSOR_DEFINED
-bool IODECL(SetRound)(Cookie, const char *, std::size_t);
+bool IONAME(SetRound)(Cookie, const char *, std::size_t);
// SIGN=PLUS, SUPPRESS, PROCESSOR_DEFINED
-bool IODECL(SetSign)(Cookie, const char *, std::size_t);
+bool IONAME(SetSign)(Cookie, const char *, std::size_t);
// Data item transfer for modes other than NAMELIST:
// Any data object that can be passed as an actual argument without the
@@ -256,34 +256,34 @@ bool IODECL(SetSign)(Cookie, const char *, std::size_t);
// Once the statement has encountered an error, all following items will be
// ignored and also return false; but compiled code should check for errors
// and avoid the following items when they might crash.
-bool IODECL(OutputDescriptor)(Cookie, const Descriptor &);
-bool IODECL(InputDescriptor)(Cookie, const Descriptor &);
+bool IONAME(OutputDescriptor)(Cookie, const Descriptor &);
+bool IONAME(InputDescriptor)(Cookie, const Descriptor &);
// Formatted (including list directed) I/O data items
-bool IODECL(OutputInteger8)(Cookie, std::int8_t);
-bool IODECL(OutputInteger16)(Cookie, std::int16_t);
+bool IONAME(OutputInteger8)(Cookie, std::int8_t);
+bool IONAME(OutputInteger16)(Cookie, std::int16_t);
bool IODECL(OutputInteger32)(Cookie, std::int32_t);
-bool IODECL(OutputInteger64)(Cookie, std::int64_t);
-bool IODECL(OutputInteger128)(Cookie, common::int128_t);
-bool IODECL(InputInteger)(Cookie, std::int64_t &, int kind = 8);
-bool IODECL(OutputReal32)(Cookie, float);
-bool IODECL(InputReal32)(Cookie, float &);
-bool IODECL(OutputReal64)(Cookie, double);
-bool IODECL(InputReal64)(Cookie, double &);
-bool IODECL(OutputComplex32)(Cookie, float, float);
-bool IODECL(InputComplex32)(Cookie, float[2]);
-bool IODECL(OutputComplex64)(Cookie, double, double);
-bool IODECL(InputComplex64)(Cookie, double[2]);
-bool IODECL(OutputCharacter)(Cookie, const char *, std::size_t, int kind = 1);
-bool IODECL(OutputAscii)(Cookie, const char *, std::size_t);
-bool IODECL(InputCharacter)(Cookie, char *, std::size_t, int kind = 1);
-bool IODECL(InputAscii)(Cookie, char *, std::size_t);
-bool IODECL(OutputLogical)(Cookie, bool);
-bool IODECL(InputLogical)(Cookie, bool &);
+bool IONAME(OutputInteger64)(Cookie, std::int64_t);
+bool IONAME(OutputInteger128)(Cookie, common::int128_t);
+bool IONAME(InputInteger)(Cookie, std::int64_t &, int kind = 8);
+bool IONAME(OutputReal32)(Cookie, float);
+bool IONAME(InputReal32)(Cookie, float &);
+bool IONAME(OutputReal64)(Cookie, double);
+bool IONAME(InputReal64)(Cookie, double &);
+bool IONAME(OutputComplex32)(Cookie, float, float);
+bool IONAME(InputComplex32)(Cookie, float[2]);
+bool IONAME(OutputComplex64)(Cookie, double, double);
+bool IONAME(InputComplex64)(Cookie, double[2]);
+bool IONAME(OutputCharacter)(Cookie, const char *, std::size_t, int kind = 1);
+bool IONAME(OutputAscii)(Cookie, const char *, std::size_t);
+bool IONAME(InputCharacter)(Cookie, char *, std::size_t, int kind = 1);
+bool IONAME(InputAscii)(Cookie, char *, std::size_t);
+bool IONAME(OutputLogical)(Cookie, bool);
+bool IONAME(InputLogical)(Cookie, bool &);
// NAMELIST I/O must be the only data item in an (otherwise)
// list-directed I/O statement.
-bool IODECL(OutputNamelist)(Cookie, const NamelistGroup &);
-bool IODECL(InputNamelist)(Cookie, const NamelistGroup &);
+bool IONAME(OutputNamelist)(Cookie, const NamelistGroup &);
+bool IONAME(InputNamelist)(Cookie, const NamelistGroup &);
// When an I/O list item has a derived type with a specific defined
// I/O subroutine of the appropriate generic kind for the active
@@ -294,9 +294,9 @@ bool IODECL(InputNamelist)(Cookie, const NamelistGroup &);
// made such a generic interface inaccessible), these data item transfer
// APIs enable the I/O runtime to make the right calls to defined I/O
// subroutines.
-bool IODECL(OutputDerivedType)(
+bool IONAME(OutputDerivedType)(
Cookie, const Descriptor &, const NonTbpDefinedIoTable *);
-bool IODECL(InputDerivedType)(
+bool IONAME(InputDerivedType)(
Cookie, const Descriptor &, const NonTbpDefinedIoTable *);
// Additional specifier interfaces for the connection-list of
@@ -304,56 +304,56 @@ bool IODECL(InputDerivedType)(
// SetDelim(), GetIoMsg(), SetPad(), SetRound(), SetSign(),
// & SetAsynchronous() are also acceptable for OPEN.
// ACCESS=SEQUENTIAL, DIRECT, STREAM
-bool IODECL(SetAccess)(Cookie, const char *, std::size_t);
+bool IONAME(SetAccess)(Cookie, const char *, std::size_t);
// ACTION=READ, WRITE, or READWRITE
-bool IODECL(SetAction)(Cookie, const char *, std::size_t);
+bool IONAME(SetAction)(Cookie, const char *, std::size_t);
// CARRIAGECONTROL=LIST, FORTRAN, NONE
-bool IODECL(SetCarriagecontrol)(Cookie, const char *, std::size_t);
+bool IONAME(SetCarriagecontrol)(Cookie, const char *, std::size_t);
// CONVERT=NATIVE, LITTLE_ENDIAN, BIG_ENDIAN, or SWAP
-bool IODECL(SetConvert)(Cookie, const char *, std::size_t);
+bool IONAME(SetConvert)(Cookie, const char *, std::size_t);
// ENCODING=UTF-8, DEFAULT
-bool IODECL(SetEncoding)(Cookie, const char *, std::size_t);
+bool IONAME(SetEncoding)(Cookie, const char *, std::size_t);
// FORM=FORMATTED, UNFORMATTED
-bool IODECL(SetForm)(Cookie, const char *, std::size_t);
+bool IONAME(SetForm)(Cookie, const char *, std::size_t);
// POSITION=ASIS, REWIND, APPEND
-bool IODECL(SetPosition)(Cookie, const char *, std::size_t);
-bool IODECL(SetRecl)(Cookie, std::size_t); // RECL=
+bool IONAME(SetPosition)(Cookie, const char *, std::size_t);
+bool IONAME(SetRecl)(Cookie, std::size_t); // RECL=
// STATUS can be set during an OPEN or CLOSE statement.
// For OPEN: STATUS=OLD, NEW, SCRATCH, REPLACE, UNKNOWN
// For CLOSE: STATUS=KEEP, DELETE
-bool IODECL(SetStatus)(Cookie, const char *, std::size_t);
+bool IONAME(SetStatus)(Cookie, const char *, std::size_t);
-bool IODECL(SetFile)(Cookie, const char *, std::size_t chars);
+bool IONAME(SetFile)(Cookie, const char *, std::size_t chars);
// Acquires the runtime-created unit number for OPEN(NEWUNIT=)
-bool IODECL(GetNewUnit)(Cookie, int &, int kind = 4);
+bool IONAME(GetNewUnit)(Cookie, int &, int kind = 4);
// READ(SIZE=), after all input items
-std::size_t IODECL(GetSize)(Cookie);
+std::size_t IONAME(GetSize)(Cookie);
// INQUIRE(IOLENGTH=), after all output items
-std::size_t IODECL(GetIoLength)(Cookie);
+std::size_t IONAME(GetIoLength)(Cookie);
// GetIoMsg() does not modify its argument unless an error or
// end-of-record/file condition is present.
-void IODECL(GetIoMsg)(Cookie, char *, std::size_t); // IOMSG=
+void IONAME(GetIoMsg)(Cookie, char *, std::size_t); // IOMSG=
// Defines ID= on READ/WRITE(ASYNCHRONOUS='YES')
-AsynchronousId IODECL(GetAsynchronousId)(Cookie);
+AsynchronousId IONAME(GetAsynchronousId)(Cookie);
// INQUIRE() specifiers are mostly identified by their NUL-terminated
// case-insensitive names.
// ACCESS, ACTION, ASYNCHRONOUS, BLANK, CONVERT, DECIMAL, DELIM, DIRECT,
// ENCODING, FORM, FORMATTED, NAME, PAD, POSITION, READ, READWRITE, ROUND,
// SEQUENTIAL, SIGN, STREAM, UNFORMATTED, WRITE:
-bool IODECL(InquireCharacter)(Cookie, InquiryKeywordHash, char *, std::size_t);
+bool IONAME(InquireCharacter)(Cookie, InquiryKeywordHash, char *, std::size_t);
// EXIST, NAMED, OPENED, and PENDING (without ID):
-bool IODECL(InquireLogical)(Cookie, InquiryKeywordHash, bool &);
+bool IONAME(InquireLogical)(Cookie, InquiryKeywordHash, bool &);
// PENDING with ID
-bool IODECL(InquirePendingId)(Cookie, AsynchronousId, bool &);
+bool IONAME(InquirePendingId)(Cookie, AsynchronousId, bool &);
// NEXTREC, NUMBER, POS, RECL, SIZE
-bool IODECL(InquireInteger64)(
+bool IONAME(InquireInteger64)(
Cookie, InquiryKeywordHash, std::int64_t &, int kind = 8);
// This function must be called to end an I/O statement, and its
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index 6eba243c237c..315a3f6736aa 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -1340,15 +1340,6 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
} else {
addr = hlfir::genVariableRawAddress(loc, builder, entity);
}
- // The last extent created for assumed-rank descriptors must be -1 (18.5.3
- // point 5.). This should be done when creating the assumed-size shape for
- // consistency.
- if (auto baseBoxDummy = mlir::dyn_cast<fir::BaseBoxType>(dummyType))
- if (baseBoxDummy.isAssumedRank())
- if (const Fortran::semantics::Symbol *sym =
- Fortran::evaluate::UnwrapWholeSymbolDataRef(*arg.entity))
- if (Fortran::semantics::IsAssumedSizeArray(sym->GetUltimate()))
- TODO(loc, "passing assumed-size to assumed-rank array");
// For ranked actual passed to assumed-rank dummy, the cast to assumed-rank
// box is inserted when building the fir.call op. Inserting it here would
diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
index 0d05ca5aee65..c1c94119fd90 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
@@ -13,6 +13,7 @@
#include "ReductionProcessor.h"
#include "flang/Lower/AbstractConverter.h"
+#include "flang/Lower/SymbolMap.h"
#include "flang/Optimizer/Builder/HLFIRTools.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/Dialect/FIRType.h"
@@ -522,12 +523,20 @@ void ReductionProcessor::addDeclareReduction(
if (reductionSymbols)
reductionSymbols->push_back(symbol);
mlir::Value symVal = converter.getSymbolAddress(*symbol);
- auto redType = mlir::cast<fir::ReferenceType>(symVal.getType());
+ mlir::Type eleType;
+ auto refType = mlir::dyn_cast_or_null<fir::ReferenceType>(symVal.getType());
+ if (refType)
+ eleType = refType.getEleTy();
+ else
+ eleType = symVal.getType();
// all arrays must be boxed so that we have convenient access to all the
// information needed to iterate over the array
- if (mlir::isa<fir::SequenceType>(redType.getEleTy())) {
- hlfir::Entity entity{symVal};
+ if (mlir::isa<fir::SequenceType>(eleType)) {
+ // For Host associated symbols, use `SymbolBox` instead
+ Fortran::lower::SymbolBox symBox =
+ converter.lookupOneLevelUpSymbol(*symbol);
+ hlfir::Entity entity{symBox.getAddr()};
entity = genVariableBox(currentLocation, builder, entity);
mlir::Value box = entity.getBase();
@@ -538,11 +547,25 @@ void ReductionProcessor::addDeclareReduction(
builder.create<fir::StoreOp>(currentLocation, box, alloca);
symVal = alloca;
- redType = mlir::cast<fir::ReferenceType>(symVal.getType());
+ } else if (mlir::isa<fir::BaseBoxType>(symVal.getType())) {
+ // boxed arrays are passed as values not by reference. Unfortunately,
+ // we can't pass a box by value to omp.reduction_declare, so turn it
+ // into a reference
+
+ auto alloca =
+ builder.create<fir::AllocaOp>(currentLocation, symVal.getType());
+ builder.create<fir::StoreOp>(currentLocation, symVal, alloca);
+ symVal = alloca;
} else if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>()) {
symVal = declOp.getBase();
}
+ // this isn't the same as the by-val and by-ref passing later in the
+ // pipeline. Both styles assume that the variable is a reference at
+ // this point
+ assert(mlir::isa<fir::ReferenceType>(symVal.getType()) &&
+ "reduction input var is a reference");
+
reductionVars.push_back(symVal);
}
const bool isByRef = doReductionByRef(reductionVars);
diff --git a/flang/lib/Lower/Runtime.cpp b/flang/lib/Lower/Runtime.cpp
index e7695929623f..3474832bdb22 100644
--- a/flang/lib/Lower/Runtime.cpp
+++ b/flang/lib/Lower/Runtime.cpp
@@ -55,6 +55,8 @@ static void genUnreachable(fir::FirOpBuilder &builder, mlir::Location loc) {
void Fortran::lower::genStopStatement(
Fortran::lower::AbstractConverter &converter,
const Fortran::parser::StopStmt &stmt) {
+ const bool isError = std::get<Fortran::parser::StopStmt::Kind>(stmt.t) ==
+ Fortran::parser::StopStmt::Kind::ErrorStop;
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
mlir::Location loc = converter.getCurrentLocation();
Fortran::lower::StatementContext stmtCtx;
@@ -94,13 +96,12 @@ void Fortran::lower::genStopStatement(
} else {
callee = fir::runtime::getRuntimeFunc<mkRTKey(StopStatement)>(loc, builder);
calleeType = callee.getFunctionType();
- operands.push_back(
- builder.createIntegerConstant(loc, calleeType.getInput(0), 0));
+ // Default values are as advised in F'2023 11.4 p2.
+ operands.push_back(builder.createIntegerConstant(
+ loc, calleeType.getInput(0), isError ? 1 : 0));
}
// Second operand indicates ERROR STOP
- bool isError = std::get<Fortran::parser::StopStmt::Kind>(stmt.t) ==
- Fortran::parser::StopStmt::Kind::ErrorStop;
operands.push_back(builder.createIntegerConstant(
loc, calleeType.getInput(operands.size()), isError));
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index dec8fee774c5..b2de37759a06 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -948,6 +948,11 @@ void CheckHelper::CheckObjectEntity(
"Component '%s' with ATTRIBUTES(DEVICE) must also be allocatable"_err_en_US,
symbol.name());
}
+ if (IsAssumedSizeArray(symbol)) {
+ messages_.Say(
+ "Object '%s' with ATTRIBUTES(DEVICE) may not be assumed size"_err_en_US,
+ symbol.name());
+ }
break;
case common::CUDADataAttr::Managed:
if (!IsAutomatic(symbol) && !IsAllocatable(symbol) &&
diff --git a/flang/runtime/command.cpp b/flang/runtime/command.cpp
index fabfe601688b..b573c5dfd797 100644
--- a/flang/runtime/command.cpp
+++ b/flang/runtime/command.cpp
@@ -16,9 +16,7 @@
#include <limits>
#ifdef _WIN32
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <windows.h>
+#include "flang/Common/windows-include.h"
// On Windows GetCurrentProcessId returns a DWORD aka uint32_t
#include <processthreadsapi.h>
diff --git a/flang/runtime/environment.cpp b/flang/runtime/environment.cpp
index b2c9665a28df..b74067a37777 100644
--- a/flang/runtime/environment.cpp
+++ b/flang/runtime/environment.cpp
@@ -49,7 +49,6 @@ static void SetEnvironmentDefaults(const EnvironmentDefaultList *envDefaults) {
}
}
-RT_OFFLOAD_API_GROUP_BEGIN
Fortran::common::optional<Convert> GetConvertFromString(
const char *x, std::size_t n) {
static const char *keywords[]{
@@ -69,7 +68,6 @@ Fortran::common::optional<Convert> GetConvertFromString(
return Fortran::common::nullopt;
}
}
-RT_OFFLOAD_API_GROUP_END
void ExecutionEnvironment::Configure(int ac, const char *av[],
const char *env[], const EnvironmentDefaultList *envDefaults) {
diff --git a/flang/runtime/environment.h b/flang/runtime/environment.h
index b8b9f10e4e57..6c56993fb1d6 100644
--- a/flang/runtime/environment.h
+++ b/flang/runtime/environment.h
@@ -31,7 +31,7 @@ RT_OFFLOAD_VAR_GROUP_END
// External unformatted I/O data conversions
enum class Convert { Unknown, Native, LittleEndian, BigEndian, Swap };
-RT_API_ATTRS Fortran::common::optional<Convert> GetConvertFromString(
+Fortran::common::optional<Convert> GetConvertFromString(
const char *, std::size_t);
struct ExecutionEnvironment {
diff --git a/flang/runtime/execute.cpp b/flang/runtime/execute.cpp
index c84930c5c328..0f5bc5059e21 100644
--- a/flang/runtime/execute.cpp
+++ b/flang/runtime/execute.cpp
@@ -16,9 +16,7 @@
#include <future>
#include <limits>
#ifdef _WIN32
-#define LEAN_AND_MEAN
-#define NOMINMAX
-#include <windows.h>
+#include "flang/Common/windows-include.h"
#else
#include <signal.h>
#include <sys/wait.h>
diff --git a/flang/runtime/file.cpp b/flang/runtime/file.cpp
index 67764f1f5626..acd5d33d4bb8 100644
--- a/flang/runtime/file.cpp
+++ b/flang/runtime/file.cpp
@@ -17,9 +17,8 @@
#include <stdlib.h>
#include <sys/stat.h>
#ifdef _WIN32
-#define NOMINMAX
+#include "flang/Common/windows-include.h"
#include <io.h>
-#include <windows.h>
#else
#include <unistd.h>
#endif
diff --git a/flang/runtime/freestanding-tools.h b/flang/runtime/freestanding-tools.h
index 9089dc6bcf53..451bf13b9fa6 100644
--- a/flang/runtime/freestanding-tools.h
+++ b/flang/runtime/freestanding-tools.h
@@ -52,11 +52,6 @@
#define STD_STRCPY_UNSUPPORTED 1
#endif
-#if !defined(STD_STRCMP_UNSUPPORTED) && \
- (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
-#define STD_STRCMP_UNSUPPORTED 1
-#endif
-
namespace Fortran::runtime {
#if STD_FILL_N_UNSUPPORTED
@@ -181,19 +176,5 @@ static inline RT_API_ATTRS char *strcpy(char *dest, const char *src) {
using std::strcpy;
#endif // !STD_STRCPY_UNSUPPORTED
-#if STD_STRCMP_UNSUPPORTED
-// Provides alternative implementation for std::strcmp(), if
-// it is not supported.
-static inline RT_API_ATTRS int strcmp(const char *lhs, const char *rhs) {
- while (*lhs != '\0' && *lhs == *rhs) {
- ++lhs;
- ++rhs;
- }
- return static_cast<unsigned char>(*lhs) - static_cast<unsigned char>(*rhs);
-}
-#else // !STD_STRCMP_UNSUPPORTED
-using std::strcmp;
-#endif // !STD_STRCMP_UNSUPPORTED
-
} // namespace Fortran::runtime
#endif // FORTRAN_RUNTIME_FREESTANDING_TOOLS_H_
diff --git a/flang/runtime/io-api.cpp b/flang/runtime/io-api.cpp
index ccb5b576451d..3a86c9fa7375 100644
--- a/flang/runtime/io-api.cpp
+++ b/flang/runtime/io-api.cpp
@@ -25,9 +25,8 @@
#include <memory>
namespace Fortran::runtime::io {
-RT_EXT_API_GROUP_BEGIN
-RT_API_ATTRS const char *InquiryKeywordHashDecode(
+const char *InquiryKeywordHashDecode(
char *buffer, std::size_t n, InquiryKeywordHash hash) {
if (n < 1) {
return nullptr;
@@ -45,7 +44,7 @@ RT_API_ATTRS const char *InquiryKeywordHashDecode(
}
template <Direction DIR>
-RT_API_ATTRS Cookie BeginInternalArrayListIO(const Descriptor &descriptor,
+Cookie BeginInternalArrayListIO(const Descriptor &descriptor,
void ** /*scratchArea*/, std::size_t /*scratchBytes*/,
const char *sourceFile, int sourceLine) {
Terminator oom{sourceFile, sourceLine};
@@ -55,14 +54,14 @@ RT_API_ATTRS Cookie BeginInternalArrayListIO(const Descriptor &descriptor,
->ioStatementState();
}
-Cookie IODEF(BeginInternalArrayListOutput)(const Descriptor &descriptor,
+Cookie IONAME(BeginInternalArrayListOutput)(const Descriptor &descriptor,
void **scratchArea, std::size_t scratchBytes, const char *sourceFile,
int sourceLine) {
return BeginInternalArrayListIO<Direction::Output>(
descriptor, scratchArea, scratchBytes, sourceFile, sourceLine);
}
-Cookie IODEF(BeginInternalArrayListInput)(const Descriptor &descriptor,
+Cookie IONAME(BeginInternalArrayListInput)(const Descriptor &descriptor,
void **scratchArea, std::size_t scratchBytes, const char *sourceFile,
int sourceLine) {
return BeginInternalArrayListIO<Direction::Input>(
@@ -70,7 +69,7 @@ Cookie IODEF(BeginInternalArrayListInput)(const Descriptor &descriptor,
}
template <Direction DIR>
-RT_API_ATTRS Cookie BeginInternalArrayFormattedIO(const Descriptor &descriptor,
+Cookie BeginInternalArrayFormattedIO(const Descriptor &descriptor,
const char *format, std::size_t formatLength,
const Descriptor *formatDescriptor, void ** /*scratchArea*/,
std::size_t /*scratchBytes*/, const char *sourceFile, int sourceLine) {
@@ -81,7 +80,7 @@ RT_API_ATTRS Cookie BeginInternalArrayFormattedIO(const Descriptor &descriptor,
->ioStatementState();
}
-Cookie IODEF(BeginInternalArrayFormattedOutput)(const Descriptor &descriptor,
+Cookie IONAME(BeginInternalArrayFormattedOutput)(const Descriptor &descriptor,
const char *format, std::size_t formatLength,
const Descriptor *formatDescriptor, void **scratchArea,
std::size_t scratchBytes, const char *sourceFile, int sourceLine) {
@@ -90,7 +89,7 @@ Cookie IODEF(BeginInternalArrayFormattedOutput)(const Descriptor &descriptor,
sourceLine);
}
-Cookie IODEF(BeginInternalArrayFormattedInput)(const Descriptor &descriptor,
+Cookie IONAME(BeginInternalArrayFormattedInput)(const Descriptor &descriptor,
const char *format, std::size_t formatLength,
const Descriptor *formatDescriptor, void **scratchArea,
std::size_t scratchBytes, const char *sourceFile, int sourceLine) {
@@ -111,14 +110,14 @@ RT_API_ATTRS Cookie BeginInternalListIO(
->ioStatementState();
}
-Cookie IODEF(BeginInternalListOutput)(char *internal,
+Cookie IONAME(BeginInternalListOutput)(char *internal,
std::size_t internalLength, void **scratchArea, std::size_t scratchBytes,
const char *sourceFile, int sourceLine) {
return BeginInternalListIO<Direction::Output>(internal, internalLength,
scratchArea, scratchBytes, sourceFile, sourceLine);
}
-Cookie IODEF(BeginInternalListInput)(const char *internal,
+Cookie IONAME(BeginInternalListInput)(const char *internal,
std::size_t internalLength, void **scratchArea, std::size_t scratchBytes,
const char *sourceFile, int sourceLine) {
return BeginInternalListIO<Direction::Input>(internal, internalLength,
@@ -126,7 +125,7 @@ Cookie IODEF(BeginInternalListInput)(const char *internal,
}
template <Direction DIR>
-RT_API_ATTRS Cookie BeginInternalFormattedIO(
+Cookie BeginInternalFormattedIO(
std::conditional_t<DIR == Direction::Input, const char, char> *internal,
std::size_t internalLength, const char *format, std::size_t formatLength,
const Descriptor *formatDescriptor, void ** /*scratchArea*/,
@@ -139,7 +138,7 @@ RT_API_ATTRS Cookie BeginInternalFormattedIO(
->ioStatementState();
}
-Cookie IODEF(BeginInternalFormattedOutput)(char *internal,
+Cookie IONAME(BeginInternalFormattedOutput)(char *internal,
std::size_t internalLength, const char *format, std::size_t formatLength,
const Descriptor *formatDescriptor, void **scratchArea,
std::size_t scratchBytes, const char *sourceFile, int sourceLine) {
@@ -148,7 +147,7 @@ Cookie IODEF(BeginInternalFormattedOutput)(char *internal,
sourceFile, sourceLine);
}
-Cookie IODEF(BeginInternalFormattedInput)(const char *internal,
+Cookie IONAME(BeginInternalFormattedInput)(const char *internal,
std::size_t internalLength, const char *format, std::size_t formatLength,
const Descriptor *formatDescriptor, void **scratchArea,
std::size_t scratchBytes, const char *sourceFile, int sourceLine) {
@@ -228,22 +227,24 @@ RT_API_ATTRS Cookie BeginExternalListIO(
}
}
+RT_EXT_API_GROUP_BEGIN
Cookie IODEF(BeginExternalListOutput)(
ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
return BeginExternalListIO<Direction::Output, ExternalListIoStatementState>(
unitNumber, sourceFile, sourceLine);
}
+RT_EXT_API_GROUP_END
-Cookie IODEF(BeginExternalListInput)(
+Cookie IONAME(BeginExternalListInput)(
ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
return BeginExternalListIO<Direction::Input, ExternalListIoStatementState>(
unitNumber, sourceFile, sourceLine);
}
template <Direction DIR>
-RT_API_ATTRS Cookie BeginExternalFormattedIO(const char *format,
- std::size_t formatLength, const Descriptor *formatDescriptor,
- ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
+Cookie BeginExternalFormattedIO(const char *format, std::size_t formatLength,
+ const Descriptor *formatDescriptor, ExternalUnit unitNumber,
+ const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
Cookie errorCookie{nullptr};
ExternalFileUnit *unit{GetOrCreateUnit(
@@ -285,14 +286,14 @@ RT_API_ATTRS Cookie BeginExternalFormattedIO(const char *format,
}
}
-Cookie IODEF(BeginExternalFormattedOutput)(const char *format,
+Cookie IONAME(BeginExternalFormattedOutput)(const char *format,
std::size_t formatLength, const Descriptor *formatDescriptor,
ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
return BeginExternalFormattedIO<Direction::Output>(format, formatLength,
formatDescriptor, unitNumber, sourceFile, sourceLine);
}
-Cookie IODEF(BeginExternalFormattedInput)(const char *format,
+Cookie IONAME(BeginExternalFormattedInput)(const char *format,
std::size_t formatLength, const Descriptor *formatDescriptor,
ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
return BeginExternalFormattedIO<Direction::Input>(format, formatLength,
@@ -300,7 +301,7 @@ Cookie IODEF(BeginExternalFormattedInput)(const char *format,
}
template <Direction DIR>
-RT_API_ATTRS Cookie BeginUnformattedIO(
+Cookie BeginUnformattedIO(
ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
Cookie errorCookie{nullptr};
@@ -351,19 +352,19 @@ RT_API_ATTRS Cookie BeginUnformattedIO(
}
}
-Cookie IODEF(BeginUnformattedOutput)(
+Cookie IONAME(BeginUnformattedOutput)(
ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
return BeginUnformattedIO<Direction::Output>(
unitNumber, sourceFile, sourceLine);
}
-Cookie IODEF(BeginUnformattedInput)(
+Cookie IONAME(BeginUnformattedInput)(
ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
return BeginUnformattedIO<Direction::Input>(
unitNumber, sourceFile, sourceLine);
}
-Cookie IODEF(BeginOpenUnit)( // OPEN(without NEWUNIT=)
+Cookie IONAME(BeginOpenUnit)( // OPEN(without NEWUNIT=)
ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
bool wasExtant{false};
@@ -383,7 +384,7 @@ Cookie IODEF(BeginOpenUnit)( // OPEN(without NEWUNIT=)
}
}
-Cookie IODEF(BeginOpenNewUnit)( // OPEN(NEWUNIT=j)
+Cookie IONAME(BeginOpenNewUnit)( // OPEN(NEWUNIT=j)
const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
ExternalFileUnit &unit{
@@ -393,7 +394,7 @@ Cookie IODEF(BeginOpenNewUnit)( // OPEN(NEWUNIT=j)
sourceLine);
}
-Cookie IODEF(BeginWait)(ExternalUnit unitNumber, AsynchronousId id,
+Cookie IONAME(BeginWait)(ExternalUnit unitNumber, AsynchronousId id,
const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
if (ExternalFileUnit * unit{ExternalFileUnit::LookUp(unitNumber)}) {
@@ -409,12 +410,12 @@ Cookie IODEF(BeginWait)(ExternalUnit unitNumber, AsynchronousId id,
terminator, unitNumber, id == 0 ? IostatOk : IostatBadWaitUnit);
}
}
-Cookie IODEF(BeginWaitAll)(
+Cookie IONAME(BeginWaitAll)(
ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
return IONAME(BeginWait)(unitNumber, 0 /*no ID=*/, sourceFile, sourceLine);
}
-Cookie IODEF(BeginClose)(
+Cookie IONAME(BeginClose)(
ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
if (ExternalFileUnit * unit{ExternalFileUnit::LookUp(unitNumber)}) {
@@ -433,7 +434,7 @@ Cookie IODEF(BeginClose)(
}
}
-Cookie IODEF(BeginFlush)(
+Cookie IONAME(BeginFlush)(
ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
if (ExternalFileUnit * unit{ExternalFileUnit::LookUp(unitNumber)}) {
@@ -451,7 +452,7 @@ Cookie IODEF(BeginFlush)(
}
}
-Cookie IODEF(BeginBackspace)(
+Cookie IONAME(BeginBackspace)(
ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
if (ExternalFileUnit * unit{ExternalFileUnit::LookUp(unitNumber)}) {
@@ -469,7 +470,7 @@ Cookie IODEF(BeginBackspace)(
}
}
-Cookie IODEF(BeginEndfile)(
+Cookie IONAME(BeginEndfile)(
ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
Cookie errorCookie{nullptr};
@@ -489,7 +490,7 @@ Cookie IODEF(BeginEndfile)(
}
}
-Cookie IODEF(BeginRewind)(
+Cookie IONAME(BeginRewind)(
ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
Cookie errorCookie{nullptr};
@@ -509,7 +510,7 @@ Cookie IODEF(BeginRewind)(
}
}
-Cookie IODEF(BeginInquireUnit)(
+Cookie IONAME(BeginInquireUnit)(
ExternalUnit unitNumber, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
if (ExternalFileUnit * unit{ExternalFileUnit::LookUp(unitNumber)}) {
@@ -529,14 +530,14 @@ Cookie IODEF(BeginInquireUnit)(
}
}
-Cookie IODEF(BeginInquireFile)(const char *path, std::size_t pathLength,
+Cookie IONAME(BeginInquireFile)(const char *path, std::size_t pathLength,
const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
auto trimmed{SaveDefaultCharacter(
path, TrimTrailingSpaces(path, pathLength), terminator)};
if (ExternalFileUnit *
unit{ExternalFileUnit::LookUp(
- trimmed.get(), Fortran::runtime::strlen(trimmed.get()))}) {
+ trimmed.get(), std::strlen(trimmed.get()))}) {
// INQUIRE(FILE=) to a connected unit
if (ChildIo * child{unit->GetChildIo()}) {
return &child->BeginIoStatement<InquireUnitState>(
@@ -553,7 +554,7 @@ Cookie IODEF(BeginInquireFile)(const char *path, std::size_t pathLength,
}
}
-Cookie IODEF(BeginInquireIoLength)(const char *sourceFile, int sourceLine) {
+Cookie IONAME(BeginInquireIoLength)(const char *sourceFile, int sourceLine) {
Terminator oom{sourceFile, sourceLine};
return &New<InquireIOLengthState>{oom}(sourceFile, sourceLine)
.release()
@@ -562,7 +563,7 @@ Cookie IODEF(BeginInquireIoLength)(const char *sourceFile, int sourceLine) {
// Control list items
-void IODEF(EnableHandlers)(Cookie cookie, bool hasIoStat, bool hasErr,
+void IONAME(EnableHandlers)(Cookie cookie, bool hasIoStat, bool hasErr,
bool hasEnd, bool hasEor, bool hasIoMsg) {
IoErrorHandler &handler{cookie->GetIoErrorHandler()};
if (hasIoStat) {
@@ -582,8 +583,8 @@ void IODEF(EnableHandlers)(Cookie cookie, bool hasIoStat, bool hasErr,
}
}
-static RT_API_ATTRS bool YesOrNo(const char *keyword, std::size_t length,
- const char *what, IoErrorHandler &handler) {
+static bool YesOrNo(const char *keyword, std::size_t length, const char *what,
+ IoErrorHandler &handler) {
static const char *keywords[]{"YES", "NO", nullptr};
switch (IdentifyValue(keyword, length, keywords)) {
case 0:
@@ -597,7 +598,8 @@ static RT_API_ATTRS bool YesOrNo(const char *keyword, std::size_t length,
}
}
-bool IODEF(SetAdvance)(Cookie cookie, const char *keyword, std::size_t length) {
+bool IONAME(SetAdvance)(
+ Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
IoErrorHandler &handler{io.GetIoErrorHandler()};
bool nonAdvancing{!YesOrNo(keyword, length, "ADVANCE", handler)};
@@ -614,7 +616,7 @@ bool IODEF(SetAdvance)(Cookie cookie, const char *keyword, std::size_t length) {
return !handler.InError();
}
-bool IODEF(SetBlank)(Cookie cookie, const char *keyword, std::size_t length) {
+bool IONAME(SetBlank)(Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
static const char *keywords[]{"NULL", "ZERO", nullptr};
switch (IdentifyValue(keyword, length, keywords)) {
@@ -631,7 +633,8 @@ bool IODEF(SetBlank)(Cookie cookie, const char *keyword, std::size_t length) {
}
}
-bool IODEF(SetDecimal)(Cookie cookie, const char *keyword, std::size_t length) {
+bool IONAME(SetDecimal)(
+ Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
static const char *keywords[]{"COMMA", "POINT", nullptr};
switch (IdentifyValue(keyword, length, keywords)) {
@@ -648,7 +651,7 @@ bool IODEF(SetDecimal)(Cookie cookie, const char *keyword, std::size_t length) {
}
}
-bool IODEF(SetDelim)(Cookie cookie, const char *keyword, std::size_t length) {
+bool IONAME(SetDelim)(Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
static const char *keywords[]{"APOSTROPHE", "QUOTE", "NONE", nullptr};
switch (IdentifyValue(keyword, length, keywords)) {
@@ -668,14 +671,14 @@ bool IODEF(SetDelim)(Cookie cookie, const char *keyword, std::size_t length) {
}
}
-bool IODEF(SetPad)(Cookie cookie, const char *keyword, std::size_t length) {
+bool IONAME(SetPad)(Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
IoErrorHandler &handler{io.GetIoErrorHandler()};
io.mutableModes().pad = YesOrNo(keyword, length, "PAD", handler);
return !handler.InError();
}
-bool IODEF(SetPos)(Cookie cookie, std::int64_t pos) {
+bool IONAME(SetPos)(Cookie cookie, std::int64_t pos) {
IoStatementState &io{*cookie};
IoErrorHandler &handler{io.GetIoErrorHandler()};
if (auto *unit{io.GetExternalFileUnit()}) {
@@ -686,7 +689,7 @@ bool IODEF(SetPos)(Cookie cookie, std::int64_t pos) {
return false;
}
-bool IODEF(SetRec)(Cookie cookie, std::int64_t rec) {
+bool IONAME(SetRec)(Cookie cookie, std::int64_t rec) {
IoStatementState &io{*cookie};
IoErrorHandler &handler{io.GetIoErrorHandler()};
if (auto *unit{io.GetExternalFileUnit()}) {
@@ -702,7 +705,7 @@ bool IODEF(SetRec)(Cookie cookie, std::int64_t rec) {
return true;
}
-bool IODEF(SetRound)(Cookie cookie, const char *keyword, std::size_t length) {
+bool IONAME(SetRound)(Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
static const char *keywords[]{"UP", "DOWN", "ZERO", "NEAREST", "COMPATIBLE",
"PROCESSOR_DEFINED", nullptr};
@@ -732,7 +735,7 @@ bool IODEF(SetRound)(Cookie cookie, const char *keyword, std::size_t length) {
}
}
-bool IODEF(SetSign)(Cookie cookie, const char *keyword, std::size_t length) {
+bool IONAME(SetSign)(Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
static const char *keywords[]{
"PLUS", "SUPPRESS", "PROCESSOR_DEFINED", nullptr};
@@ -751,7 +754,7 @@ bool IODEF(SetSign)(Cookie cookie, const char *keyword, std::size_t length) {
}
}
-bool IODEF(SetAccess)(Cookie cookie, const char *keyword, std::size_t length) {
+bool IONAME(SetAccess)(Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
if (!open) {
@@ -787,7 +790,7 @@ bool IODEF(SetAccess)(Cookie cookie, const char *keyword, std::size_t length) {
return true;
}
-bool IODEF(SetAction)(Cookie cookie, const char *keyword, std::size_t length) {
+bool IONAME(SetAction)(Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
if (!open) {
@@ -829,7 +832,7 @@ bool IODEF(SetAction)(Cookie cookie, const char *keyword, std::size_t length) {
return true;
}
-bool IODEF(SetAsynchronous)(
+bool IONAME(SetAsynchronous)(
Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
IoErrorHandler &handler{io.GetIoErrorHandler()};
@@ -856,7 +859,7 @@ bool IODEF(SetAsynchronous)(
return !handler.InError();
}
-bool IODEF(SetCarriagecontrol)(
+bool IONAME(SetCarriagecontrol)(
Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
@@ -888,7 +891,8 @@ bool IODEF(SetCarriagecontrol)(
}
}
-bool IODEF(SetConvert)(Cookie cookie, const char *keyword, std::size_t length) {
+bool IONAME(SetConvert)(
+ Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
if (!open) {
@@ -912,7 +916,7 @@ bool IODEF(SetConvert)(Cookie cookie, const char *keyword, std::size_t length) {
}
}
-bool IODEF(SetEncoding)(
+bool IONAME(SetEncoding)(
Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
@@ -944,7 +948,7 @@ bool IODEF(SetEncoding)(
return true;
}
-bool IODEF(SetForm)(Cookie cookie, const char *keyword, std::size_t length) {
+bool IONAME(SetForm)(Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
if (!open) {
@@ -972,7 +976,7 @@ bool IODEF(SetForm)(Cookie cookie, const char *keyword, std::size_t length) {
return true;
}
-bool IODEF(SetPosition)(
+bool IONAME(SetPosition)(
Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
@@ -1005,7 +1009,7 @@ bool IODEF(SetPosition)(
return true;
}
-bool IODEF(SetRecl)(Cookie cookie, std::size_t n) {
+bool IONAME(SetRecl)(Cookie cookie, std::size_t n) {
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
if (!open) {
@@ -1032,7 +1036,7 @@ bool IODEF(SetRecl)(Cookie cookie, std::size_t n) {
}
}
-bool IODEF(SetStatus)(Cookie cookie, const char *keyword, std::size_t length) {
+bool IONAME(SetStatus)(Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
if (auto *open{io.get_if<OpenStatementState>()}) {
if (open->completedOperation()) {
@@ -1086,7 +1090,7 @@ bool IODEF(SetStatus)(Cookie cookie, const char *keyword, std::size_t length) {
"SetStatus() called when not in an OPEN or CLOSE statement");
}
-bool IODEF(SetFile)(Cookie cookie, const char *path, std::size_t chars) {
+bool IONAME(SetFile)(Cookie cookie, const char *path, std::size_t chars) {
IoStatementState &io{*cookie};
if (auto *open{io.get_if<OpenStatementState>()}) {
if (open->completedOperation()) {
@@ -1103,7 +1107,7 @@ bool IODEF(SetFile)(Cookie cookie, const char *path, std::size_t chars) {
return false;
}
-bool IODEF(GetNewUnit)(Cookie cookie, int &unit, int kind) {
+bool IONAME(GetNewUnit)(Cookie cookie, int &unit, int kind) {
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
if (!open) {
@@ -1131,15 +1135,15 @@ bool IODEF(GetNewUnit)(Cookie cookie, int &unit, int kind) {
// Data transfers
-bool IODEF(OutputDescriptor)(Cookie cookie, const Descriptor &descriptor) {
+bool IONAME(OutputDescriptor)(Cookie cookie, const Descriptor &descriptor) {
return descr::DescriptorIO<Direction::Output>(*cookie, descriptor);
}
-bool IODEF(InputDescriptor)(Cookie cookie, const Descriptor &descriptor) {
+bool IONAME(InputDescriptor)(Cookie cookie, const Descriptor &descriptor) {
return descr::DescriptorIO<Direction::Input>(*cookie, descriptor);
}
-bool IODEF(OutputInteger8)(Cookie cookie, std::int8_t n) {
+bool IONAME(OutputInteger8)(Cookie cookie, std::int8_t n) {
if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputInteger8")) {
return false;
}
@@ -1150,7 +1154,7 @@ bool IODEF(OutputInteger8)(Cookie cookie, std::int8_t n) {
return descr::DescriptorIO<Direction::Output>(*cookie, descriptor);
}
-bool IODEF(OutputInteger16)(Cookie cookie, std::int16_t n) {
+bool IONAME(OutputInteger16)(Cookie cookie, std::int16_t n) {
if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputInteger16")) {
return false;
}
@@ -1161,6 +1165,7 @@ bool IODEF(OutputInteger16)(Cookie cookie, std::int16_t n) {
return descr::DescriptorIO<Direction::Output>(*cookie, descriptor);
}
+RT_EXT_API_GROUP_BEGIN
bool IODEF(OutputInteger32)(Cookie cookie, std::int32_t n) {
if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputInteger32")) {
return false;
@@ -1171,8 +1176,9 @@ bool IODEF(OutputInteger32)(Cookie cookie, std::int32_t n) {
TypeCategory::Integer, 4, reinterpret_cast<void *>(&n), 0);
return descr::DescriptorIO<Direction::Output>(*cookie, descriptor);
}
+RT_EXT_API_GROUP_END
-bool IODEF(OutputInteger64)(Cookie cookie, std::int64_t n) {
+bool IONAME(OutputInteger64)(Cookie cookie, std::int64_t n) {
if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputInteger64")) {
return false;
}
@@ -1184,7 +1190,7 @@ bool IODEF(OutputInteger64)(Cookie cookie, std::int64_t n) {
}
#ifdef __SIZEOF_INT128__
-bool IODEF(OutputInteger128)(Cookie cookie, common::int128_t n) {
+bool IONAME(OutputInteger128)(Cookie cookie, common::int128_t n) {
if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputInteger128")) {
return false;
}
@@ -1196,7 +1202,7 @@ bool IODEF(OutputInteger128)(Cookie cookie, common::int128_t n) {
}
#endif
-bool IODEF(InputInteger)(Cookie cookie, std::int64_t &n, int kind) {
+bool IONAME(InputInteger)(Cookie cookie, std::int64_t &n, int kind) {
if (!cookie->CheckFormattedStmtType<Direction::Input>("InputInteger")) {
return false;
}
@@ -1207,7 +1213,7 @@ bool IODEF(InputInteger)(Cookie cookie, std::int64_t &n, int kind) {
return descr::DescriptorIO<Direction::Input>(*cookie, descriptor);
}
-bool IODEF(OutputReal32)(Cookie cookie, float x) {
+bool IONAME(OutputReal32)(Cookie cookie, float x) {
if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputReal32")) {
return false;
}
@@ -1217,7 +1223,7 @@ bool IODEF(OutputReal32)(Cookie cookie, float x) {
return descr::DescriptorIO<Direction::Output>(*cookie, descriptor);
}
-bool IODEF(OutputReal64)(Cookie cookie, double x) {
+bool IONAME(OutputReal64)(Cookie cookie, double x) {
if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputReal64")) {
return false;
}
@@ -1227,7 +1233,7 @@ bool IODEF(OutputReal64)(Cookie cookie, double x) {
return descr::DescriptorIO<Direction::Output>(*cookie, descriptor);
}
-bool IODEF(InputReal32)(Cookie cookie, float &x) {
+bool IONAME(InputReal32)(Cookie cookie, float &x) {
if (!cookie->CheckFormattedStmtType<Direction::Input>("InputReal32")) {
return false;
}
@@ -1237,7 +1243,7 @@ bool IODEF(InputReal32)(Cookie cookie, float &x) {
return descr::DescriptorIO<Direction::Input>(*cookie, descriptor);
}
-bool IODEF(InputReal64)(Cookie cookie, double &x) {
+bool IONAME(InputReal64)(Cookie cookie, double &x) {
if (!cookie->CheckFormattedStmtType<Direction::Input>("InputReal64")) {
return false;
}
@@ -1247,7 +1253,7 @@ bool IODEF(InputReal64)(Cookie cookie, double &x) {
return descr::DescriptorIO<Direction::Input>(*cookie, descriptor);
}
-bool IODEF(OutputComplex32)(Cookie cookie, float r, float i) {
+bool IONAME(OutputComplex32)(Cookie cookie, float r, float i) {
if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputComplex32")) {
return false;
}
@@ -1259,7 +1265,7 @@ bool IODEF(OutputComplex32)(Cookie cookie, float r, float i) {
return descr::DescriptorIO<Direction::Output>(*cookie, descriptor);
}
-bool IODEF(OutputComplex64)(Cookie cookie, double r, double i) {
+bool IONAME(OutputComplex64)(Cookie cookie, double r, double i) {
if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputComplex64")) {
return false;
}
@@ -1271,7 +1277,7 @@ bool IODEF(OutputComplex64)(Cookie cookie, double r, double i) {
return descr::DescriptorIO<Direction::Output>(*cookie, descriptor);
}
-bool IODEF(InputComplex32)(Cookie cookie, float z[2]) {
+bool IONAME(InputComplex32)(Cookie cookie, float z[2]) {
if (!cookie->CheckFormattedStmtType<Direction::Input>("InputComplex32")) {
return false;
}
@@ -1282,7 +1288,7 @@ bool IODEF(InputComplex32)(Cookie cookie, float z[2]) {
return descr::DescriptorIO<Direction::Input>(*cookie, descriptor);
}
-bool IODEF(InputComplex64)(Cookie cookie, double z[2]) {
+bool IONAME(InputComplex64)(Cookie cookie, double z[2]) {
if (!cookie->CheckFormattedStmtType<Direction::Input>("InputComplex64")) {
return false;
}
@@ -1293,7 +1299,7 @@ bool IODEF(InputComplex64)(Cookie cookie, double z[2]) {
return descr::DescriptorIO<Direction::Input>(*cookie, descriptor);
}
-bool IODEF(OutputCharacter)(
+bool IONAME(OutputCharacter)(
Cookie cookie, const char *x, std::size_t length, int kind) {
if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputCharacter")) {
return false;
@@ -1305,11 +1311,11 @@ bool IODEF(OutputCharacter)(
return descr::DescriptorIO<Direction::Output>(*cookie, descriptor);
}
-bool IODEF(OutputAscii)(Cookie cookie, const char *x, std::size_t length) {
+bool IONAME(OutputAscii)(Cookie cookie, const char *x, std::size_t length) {
return IONAME(OutputCharacter(cookie, x, length, 1));
}
-bool IODEF(InputCharacter)(
+bool IONAME(InputCharacter)(
Cookie cookie, char *x, std::size_t length, int kind) {
if (!cookie->CheckFormattedStmtType<Direction::Input>("InputCharacter")) {
return false;
@@ -1320,11 +1326,11 @@ bool IODEF(InputCharacter)(
return descr::DescriptorIO<Direction::Input>(*cookie, descriptor);
}
-bool IODEF(InputAscii)(Cookie cookie, char *x, std::size_t length) {
+bool IONAME(InputAscii)(Cookie cookie, char *x, std::size_t length) {
return IONAME(InputCharacter)(cookie, x, length, 1);
}
-bool IODEF(OutputLogical)(Cookie cookie, bool truth) {
+bool IONAME(OutputLogical)(Cookie cookie, bool truth) {
if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputLogical")) {
return false;
}
@@ -1335,7 +1341,7 @@ bool IODEF(OutputLogical)(Cookie cookie, bool truth) {
return descr::DescriptorIO<Direction::Output>(*cookie, descriptor);
}
-bool IODEF(InputLogical)(Cookie cookie, bool &truth) {
+bool IONAME(InputLogical)(Cookie cookie, bool &truth) {
if (!cookie->CheckFormattedStmtType<Direction::Input>("InputLogical")) {
return false;
}
@@ -1346,17 +1352,17 @@ bool IODEF(InputLogical)(Cookie cookie, bool &truth) {
return descr::DescriptorIO<Direction::Input>(*cookie, descriptor);
}
-bool IODEF(OutputDerivedType)(Cookie cookie, const Descriptor &descriptor,
+bool IONAME(OutputDerivedType)(Cookie cookie, const Descriptor &descriptor,
const NonTbpDefinedIoTable *table) {
return descr::DescriptorIO<Direction::Output>(*cookie, descriptor, table);
}
-bool IODEF(InputDerivedType)(Cookie cookie, const Descriptor &descriptor,
+bool IONAME(InputDerivedType)(Cookie cookie, const Descriptor &descriptor,
const NonTbpDefinedIoTable *table) {
return descr::DescriptorIO<Direction::Input>(*cookie, descriptor, table);
}
-std::size_t IODEF(GetSize)(Cookie cookie) {
+std::size_t IONAME(GetSize)(Cookie cookie) {
IoStatementState &io{*cookie};
IoErrorHandler &handler{io.GetIoErrorHandler()};
if (!handler.InError()) {
@@ -1373,7 +1379,7 @@ std::size_t IODEF(GetSize)(Cookie cookie) {
return 0;
}
-std::size_t IODEF(GetIoLength)(Cookie cookie) {
+std::size_t IONAME(GetIoLength)(Cookie cookie) {
IoStatementState &io{*cookie};
IoErrorHandler &handler{io.GetIoErrorHandler()};
if (!handler.InError()) {
@@ -1389,7 +1395,7 @@ std::size_t IODEF(GetIoLength)(Cookie cookie) {
return 0;
}
-void IODEF(GetIoMsg)(Cookie cookie, char *msg, std::size_t length) {
+void IONAME(GetIoMsg)(Cookie cookie, char *msg, std::size_t length) {
IoStatementState &io{*cookie};
IoErrorHandler &handler{io.GetIoErrorHandler()};
if (!handler.InError()) {
@@ -1400,7 +1406,7 @@ void IODEF(GetIoMsg)(Cookie cookie, char *msg, std::size_t length) {
}
}
-AsynchronousId IODEF(GetAsynchronousId)(Cookie cookie) {
+AsynchronousId IONAME(GetAsynchronousId)(Cookie cookie) {
IoStatementState &io{*cookie};
IoErrorHandler &handler{io.GetIoErrorHandler()};
if (auto *ext{io.get_if<ExternalIoStatementBase>()}) {
@@ -1413,24 +1419,24 @@ AsynchronousId IODEF(GetAsynchronousId)(Cookie cookie) {
return 0;
}
-bool IODEF(InquireCharacter)(Cookie cookie, InquiryKeywordHash inquiry,
+bool IONAME(InquireCharacter)(Cookie cookie, InquiryKeywordHash inquiry,
char *result, std::size_t length) {
IoStatementState &io{*cookie};
return io.Inquire(inquiry, result, length);
}
-bool IODEF(InquireLogical)(
+bool IONAME(InquireLogical)(
Cookie cookie, InquiryKeywordHash inquiry, bool &result) {
IoStatementState &io{*cookie};
return io.Inquire(inquiry, result);
}
-bool IODEF(InquirePendingId)(Cookie cookie, AsynchronousId id, bool &result) {
+bool IONAME(InquirePendingId)(Cookie cookie, AsynchronousId id, bool &result) {
IoStatementState &io{*cookie};
return io.Inquire(HashInquiryKeyword("PENDING"), id, result);
}
-bool IODEF(InquireInteger64)(
+bool IONAME(InquireInteger64)(
Cookie cookie, InquiryKeywordHash inquiry, std::int64_t &result, int kind) {
IoStatementState &io{*cookie};
std::int64_t n{0}; // safe "undefined" value
@@ -1446,15 +1452,17 @@ bool IODEF(InquireInteger64)(
return false;
}
+RT_EXT_API_GROUP_BEGIN
enum Iostat IODEF(EndIoStatement)(Cookie cookie) {
IoStatementState &io{*cookie};
return static_cast<enum Iostat>(io.EndIoStatement());
}
+RT_EXT_API_GROUP_END
template <typename INT>
-static RT_API_ATTRS enum Iostat CheckUnitNumberInRangeImpl(INT unit,
- bool handleError, char *ioMsg, std::size_t ioMsgLength,
- const char *sourceFile, int sourceLine) {
+static enum Iostat CheckUnitNumberInRangeImpl(INT unit, bool handleError,
+ char *ioMsg, std::size_t ioMsgLength, const char *sourceFile,
+ int sourceLine) {
static_assert(sizeof(INT) >= sizeof(ExternalUnit),
"only intended to be used when the INT to ExternalUnit conversion is "
"narrowing");
@@ -1486,15 +1494,15 @@ static RT_API_ATTRS enum Iostat CheckUnitNumberInRangeImpl(INT unit,
return IostatOk;
}
-enum Iostat IODEF(CheckUnitNumberInRange64)(std::int64_t unit, bool handleError,
- char *ioMsg, std::size_t ioMsgLength, const char *sourceFile,
- int sourceLine) {
+enum Iostat IONAME(CheckUnitNumberInRange64)(std::int64_t unit,
+ bool handleError, char *ioMsg, std::size_t ioMsgLength,
+ const char *sourceFile, int sourceLine) {
return CheckUnitNumberInRangeImpl(
unit, handleError, ioMsg, ioMsgLength, sourceFile, sourceLine);
}
#ifdef __SIZEOF_INT128__
-enum Iostat IODEF(CheckUnitNumberInRange128)(common::int128_t unit,
+enum Iostat IONAME(CheckUnitNumberInRange128)(common::int128_t unit,
bool handleError, char *ioMsg, std::size_t ioMsgLength,
const char *sourceFile, int sourceLine) {
return CheckUnitNumberInRangeImpl(
@@ -1517,5 +1525,3 @@ void std::__libcpp_verbose_abort(char const *format, ...) {
std::abort();
}
#endif
-
-RT_EXT_API_GROUP_END
diff --git a/flang/runtime/io-error.cpp b/flang/runtime/io-error.cpp
index 7a90966f8104..b006b82f6224 100644
--- a/flang/runtime/io-error.cpp
+++ b/flang/runtime/io-error.cpp
@@ -109,6 +109,8 @@ void IoErrorHandler::SignalPendingError() {
SignalError(error);
}
+RT_OFFLOAD_API_GROUP_END
+
void IoErrorHandler::SignalErrno() { SignalError(errno); }
bool IoErrorHandler::GetIoMsg(char *buffer, std::size_t bufferLength) {
@@ -125,10 +127,7 @@ bool IoErrorHandler::GetIoMsg(char *buffer, std::size_t bufferLength) {
// in LLVM v9.0.1 with inadequate modification for Fortran,
// since rectified.
bool ok{false};
-#if defined(RT_DEVICE_COMPILATION)
- // strerror_r is not available on device.
- msg = "errno description is not available on device";
-#elif HAVE_STRERROR_R
+#if HAVE_STRERROR_R
// strerror_r is thread-safe.
#if defined(__GLIBC__) && defined(_GNU_SOURCE)
// glibc defines its own incompatible version of strerror_r
@@ -158,6 +157,4 @@ bool IoErrorHandler::GetIoMsg(char *buffer, std::size_t bufferLength) {
return false;
}
}
-
-RT_OFFLOAD_API_GROUP_END
} // namespace Fortran::runtime::io
diff --git a/flang/runtime/io-error.h b/flang/runtime/io-error.h
index 426573e2faf0..0fe11c9185c0 100644
--- a/flang/runtime/io-error.h
+++ b/flang/runtime/io-error.h
@@ -61,7 +61,7 @@ public:
RT_API_ATTRS void SignalPendingError();
RT_API_ATTRS int GetIoStat() const { return ioStat_; }
- RT_API_ATTRS bool GetIoMsg(char *, std::size_t);
+ bool GetIoMsg(char *, std::size_t);
private:
enum Flag : std::uint8_t {
diff --git a/flang/runtime/lock.h b/flang/runtime/lock.h
index 9f27a8295c46..46ca28703a45 100644
--- a/flang/runtime/lock.h
+++ b/flang/runtime/lock.h
@@ -25,9 +25,7 @@
#if USE_PTHREADS
#include <pthread.h>
#elif defined(_WIN32)
-// Do not define macros for "min" and "max"
-#define NOMINMAX
-#include <windows.h>
+#include "flang/Common/windows-include.h"
#else
#include <mutex>
#endif
diff --git a/flang/runtime/namelist.cpp b/flang/runtime/namelist.cpp
index b9eed2101ecf..b502d41a8d5c 100644
--- a/flang/runtime/namelist.cpp
+++ b/flang/runtime/namelist.cpp
@@ -17,20 +17,16 @@
namespace Fortran::runtime::io {
-RT_VAR_GROUP_BEGIN
// Max size of a group, symbol or component identifier that can appear in
// NAMELIST input, plus a byte for NUL termination.
-static constexpr RT_CONST_VAR_ATTRS std::size_t nameBufferSize{201};
-RT_VAR_GROUP_END
+static constexpr std::size_t nameBufferSize{201};
-RT_OFFLOAD_API_GROUP_BEGIN
-
-static inline RT_API_ATTRS char32_t GetComma(IoStatementState &io) {
+static inline char32_t GetComma(IoStatementState &io) {
return io.mutableModes().editingFlags & decimalComma ? char32_t{';'}
: char32_t{','};
}
-bool IODEF(OutputNamelist)(Cookie cookie, const NamelistGroup &group) {
+bool IONAME(OutputNamelist)(Cookie cookie, const NamelistGroup &group) {
IoStatementState &io{*cookie};
io.CheckFormattedStmtType<Direction::Output>("OutputNamelist");
io.mutableModes().inNamelist = true;
@@ -44,8 +40,7 @@ bool IODEF(OutputNamelist)(Cookie cookie, const NamelistGroup &group) {
if ((connection.NeedAdvance(prefixLen) &&
!(io.AdvanceRecord() && EmitAscii(io, " ", 1))) ||
!EmitAscii(io, prefix, prefixLen) ||
- (connection.NeedAdvance(
- Fortran::runtime::strlen(str) + (suffix != ' ')) &&
+ (connection.NeedAdvance(std::strlen(str) + (suffix != ' ')) &&
!(io.AdvanceRecord() && EmitAscii(io, " ", 1)))) {
return false;
}
@@ -89,20 +84,20 @@ bool IODEF(OutputNamelist)(Cookie cookie, const NamelistGroup &group) {
return EmitUpperCase("/", 1, "", ' ');
}
-static constexpr RT_API_ATTRS bool IsLegalIdStart(char32_t ch) {
+static constexpr bool IsLegalIdStart(char32_t ch) {
return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_' ||
ch == '@';
}
-static constexpr RT_API_ATTRS bool IsLegalIdChar(char32_t ch) {
+static constexpr bool IsLegalIdChar(char32_t ch) {
return IsLegalIdStart(ch) || (ch >= '0' && ch <= '9');
}
-static constexpr RT_API_ATTRS char NormalizeIdChar(char32_t ch) {
+static constexpr char NormalizeIdChar(char32_t ch) {
return static_cast<char>(ch >= 'A' && ch <= 'Z' ? ch - 'A' + 'a' : ch);
}
-static RT_API_ATTRS bool GetLowerCaseName(
+static bool GetLowerCaseName(
IoStatementState &io, char buffer[], std::size_t maxLength) {
std::size_t byteLength{0};
if (auto ch{io.GetNextNonBlank(byteLength)}) {
@@ -124,7 +119,7 @@ static RT_API_ATTRS bool GetLowerCaseName(
return false;
}
-static RT_API_ATTRS Fortran::common::optional<SubscriptValue> GetSubscriptValue(
+static Fortran::common::optional<SubscriptValue> GetSubscriptValue(
IoStatementState &io) {
Fortran::common::optional<SubscriptValue> value;
std::size_t byteCount{0};
@@ -157,8 +152,8 @@ static RT_API_ATTRS Fortran::common::optional<SubscriptValue> GetSubscriptValue(
return value;
}
-static RT_API_ATTRS bool HandleSubscripts(IoStatementState &io,
- Descriptor &desc, const Descriptor &source, const char *name) {
+static bool HandleSubscripts(IoStatementState &io, Descriptor &desc,
+ const Descriptor &source, const char *name) {
IoErrorHandler &handler{io.GetIoErrorHandler()};
// Allow for blanks in subscripts; they're nonstandard, but not
// ambiguous within the parentheses.
@@ -257,7 +252,7 @@ static RT_API_ATTRS bool HandleSubscripts(IoStatementState &io,
return false;
}
-static RT_API_ATTRS void StorageSequenceExtension(
+static void StorageSequenceExtension(
Descriptor &desc, const Descriptor &source) {
// Support the near-universal extension of NAMELIST input into a
// designatable storage sequence identified by its initial scalar array
@@ -279,7 +274,7 @@ static RT_API_ATTRS void StorageSequenceExtension(
}
}
-static RT_API_ATTRS bool HandleSubstring(
+static bool HandleSubstring(
IoStatementState &io, Descriptor &desc, const char *name) {
IoErrorHandler &handler{io.GetIoErrorHandler()};
auto pair{desc.type().GetCategoryAndKind()};
@@ -340,7 +335,7 @@ static RT_API_ATTRS bool HandleSubstring(
return false;
}
-static RT_API_ATTRS bool HandleComponent(IoStatementState &io, Descriptor &desc,
+static bool HandleComponent(IoStatementState &io, Descriptor &desc,
const Descriptor &source, const char *name) {
IoErrorHandler &handler{io.GetIoErrorHandler()};
char compName[nameBufferSize];
@@ -349,8 +344,7 @@ static RT_API_ATTRS bool HandleComponent(IoStatementState &io, Descriptor &desc,
if (const typeInfo::DerivedType *
type{addendum ? addendum->derivedType() : nullptr}) {
if (const typeInfo::Component *
- comp{type->FindDataComponent(
- compName, Fortran::runtime::strlen(compName))}) {
+ comp{type->FindDataComponent(compName, std::strlen(compName))}) {
bool createdDesc{false};
if (comp->rank() > 0 && source.rank() > 0) {
// If base and component are both arrays, the component name
@@ -414,7 +408,7 @@ static RT_API_ATTRS bool HandleComponent(IoStatementState &io, Descriptor &desc,
// Advance to the terminal '/' of a namelist group or leading '&'/'$'
// of the next.
-static RT_API_ATTRS void SkipNamelistGroup(IoStatementState &io) {
+static void SkipNamelistGroup(IoStatementState &io) {
std::size_t byteCount{0};
while (auto ch{io.GetNextNonBlank(byteCount)}) {
io.HandleRelativePosition(byteCount);
@@ -437,7 +431,7 @@ static RT_API_ATTRS void SkipNamelistGroup(IoStatementState &io) {
}
}
-bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
+bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
IoStatementState &io{*cookie};
io.CheckFormattedStmtType<Direction::Input>("InputNamelist");
io.mutableModes().inNamelist = true;
@@ -476,7 +470,7 @@ bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
handler.SignalError("NAMELIST input group has no name");
return false;
}
- if (Fortran::runtime::strcmp(group.groupName, name) == 0) {
+ if (std::strcmp(group.groupName, name) == 0) {
break; // found it
}
SkipNamelistGroup(io);
@@ -495,7 +489,7 @@ bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
}
std::size_t itemIndex{0};
for (; itemIndex < group.items; ++itemIndex) {
- if (Fortran::runtime::strcmp(name, group.item[itemIndex].name) == 0) {
+ if (std::strcmp(name, group.item[itemIndex].name) == 0) {
break;
}
}
@@ -596,6 +590,8 @@ bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
return true;
}
+RT_OFFLOAD_API_GROUP_BEGIN
+
bool IsNamelistNameOrSlash(IoStatementState &io) {
if (auto *listInput{
io.get_if<ListDirectedStatementState<Direction::Input>>()}) {
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/i386-unknown-linux-gnu-as b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/i386-unknown-linux-gnu-as
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/i386-unknown-linux-gnu-as
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/i386-unknown-linux-gnu-ld b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/i386-unknown-linux-gnu-ld
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/i386-unknown-linux-gnu-ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/i386-unknown-linux-gnu-ld.bfd b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/i386-unknown-linux-gnu-ld.bfd
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/i386-unknown-linux-gnu-ld.bfd
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/i386-unknown-linux-gnu-ld.gold b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/i386-unknown-linux-gnu-ld.gold
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/i386-unknown-linux-gnu-ld.gold
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/x86_64-unknown-linux-gnu-as b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/x86_64-unknown-linux-gnu-as
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/x86_64-unknown-linux-gnu-as
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/x86_64-unknown-linux-gnu-ld b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/x86_64-unknown-linux-gnu-ld
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/x86_64-unknown-linux-gnu-ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/x86_64-unknown-linux-gnu-ld.bfd b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/x86_64-unknown-linux-gnu-ld.bfd
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/x86_64-unknown-linux-gnu-ld.bfd
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/x86_64-unknown-linux-gnu-ld.gold b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/x86_64-unknown-linux-gnu-ld.gold
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/bin/x86_64-unknown-linux-gnu-ld.gold
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/bin/as b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/bin/as
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/bin/as
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/bin/ld b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/bin/ld
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/bin/ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/bin/ld.bfd b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/bin/ld.bfd
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/bin/ld.bfd
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/bin/ld.gold b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/bin/ld.gold
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/bin/ld.gold
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/lib/.keep b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/lib/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/i386-unknown-linux-gnu/lib/.keep
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0/crtbegin.o b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0/crtbegin.o
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/crtbegin.o b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/crtbegin.o
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/crtbeginT.o b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/crtbeginT.o
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/crtbeginT.o
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/crtfastmath.o b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/crtfastmath.o
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/crtfastmath.o
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/x32/crtbegin.o b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/x32/crtbegin.o
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/x32/crtbegin.o
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/x32/crtbeginT.o b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/x32/crtbeginT.o
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/x32/crtbeginT.o
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/x32/crtfastmath.o b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/x32/crtfastmath.o
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/x32/crtfastmath.o
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/as b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/as
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/as
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/ld b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/ld
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/ld.bfd b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/ld.bfd
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/ld.bfd
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/ld.gold b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/ld.gold
new file mode 100755
index 000000000000..b23e55619b2f
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/ld.gold
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/ld.lld b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/ld.lld
new file mode 100755
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/bin/ld.lld
diff --git a/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/lib/.keep b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/lib/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/flang/test/Driver/Inputs/basic_cross_linux_tree/usr/x86_64-unknown-linux-gnu/lib/.keep
diff --git a/flang/test/Driver/driver-help-hidden.f90 b/flang/test/Driver/driver-help-hidden.f90
index bf3660d57cbb..4405b64a8d00 100644
--- a/flang/test/Driver/driver-help-hidden.f90
+++ b/flang/test/Driver/driver-help-hidden.f90
@@ -104,6 +104,9 @@
! CHECK-NEXT: -fversion-loops-for-stride
! CHECK-NEXT: Create unit-strided versions of loops
! CHECK-NEXT: -fxor-operator Enable .XOR. as a synonym of .NEQV.
+! CHECK-NEXT: --gcc-install-dir=<value>
+! CHECK-NEXT: Use GCC installation in the specified directory. The directory ends with path components like 'lib{,32,64}/gcc{,-cross}/$triple/$version'. Note: executables (e.g. ld) used by the compiler are not overridden by the selected GCC installation
+! CHECK-NEXT: --gcc-toolchain=<value> Specify a directory where Clang can find 'include' and 'lib{,32,64}/gcc{,-cross}/$triple/$version'. Clang will use the GCC installation with the largest version
! CHECK-NEXT: -gline-directives-only Emit debug line info directives only
! CHECK-NEXT: -gline-tables-only Emit debug line number tables only
! CHECK-NEXT: -gpulibc Link the LLVM C Library for GPUs
diff --git a/flang/test/Driver/driver-help.f90 b/flang/test/Driver/driver-help.f90
index b4280a454e31..c80453f0ef21 100644
--- a/flang/test/Driver/driver-help.f90
+++ b/flang/test/Driver/driver-help.f90
@@ -92,6 +92,9 @@
! HELP-NEXT: -fversion-loops-for-stride
! HELP-NEXT: Create unit-strided versions of loops
! HELP-NEXT: -fxor-operator Enable .XOR. as a synonym of .NEQV.
+! HELP-NEXT: --gcc-install-dir=<value>
+! HELP-NEXT: Use GCC installation in the specified directory. The directory ends with path components like 'lib{,32,64}/gcc{,-cross}/$triple/$version'. Note: executables (e.g. ld) used by the compiler are not overridden by the selected GCC installation
+! HELP-NEXT: --gcc-toolchain=<value> Specify a directory where Clang can find 'include' and 'lib{,32,64}/gcc{,-cross}/$triple/$version'. Clang will use the GCC installation with the largest version
! HELP-NEXT: -gline-directives-only Emit debug line info directives only
! HELP-NEXT: -gline-tables-only Emit debug line number tables only
! HELP-NEXT: -gpulibc Link the LLVM C Library for GPUs
diff --git a/flang/test/Driver/gcc-toolchain-install-dir.f90 b/flang/test/Driver/gcc-toolchain-install-dir.f90
new file mode 100644
index 000000000000..5a073b0c5171
--- /dev/null
+++ b/flang/test/Driver/gcc-toolchain-install-dir.f90
@@ -0,0 +1,21 @@
+!! Test that --gcc-toolchain and --gcc-install-dir options are working as expected.
+!! It does not test cross-compiling (--sysroot), so crtbegin.o, libgcc/compiler-rt, libc, libFortranRuntime, etc. are not supposed to be affected.
+!! PREFIX is captured twice because the driver escapes backslashes (occuring in Windows paths) in the -### output, but not on the "Selected GCC installation:" line.
+
+! RUN: %flang 2>&1 -### -v -o %t %s -no-integrated-as -fuse-ld=ld --target=i386-unknown-linux-gnu --gcc-install-dir=%S/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0 | FileCheck %s --check-prefix=CHECK-I386
+! RUN: %flang 2>&1 -### -v -o %t %s -no-integrated-as -fuse-ld=ld --target=i386-unknown-linux-gnu --gcc-toolchain=%S/Inputs/basic_cross_linux_tree/usr | FileCheck %s --check-prefix=CHECK-I386
+! CHECK-I386: Selected GCC installation: [[PREFIX:[^"]+]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0
+! CHECK-I386: "-fc1" "-triple" "i386-unknown-linux-gnu"
+! CHECK-I386: "[[PREFIX:[^"]+]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0/../../../../i386-unknown-linux-gnu/bin{{/|\\\\}}as"
+! CHECK-I386: "[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0/../../../../i386-unknown-linux-gnu/bin{{/|\\\\}}ld" {{.*}} "-m" "elf_i386"
+! CHECK-I386-SAME: "-L[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0"
+! CHECK-I386-SAME: "-L[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0/../../../../i386-unknown-linux-gnu/lib"
+
+! RUN: %flang 2>&1 -### -v -o %t %s -no-integrated-as -fuse-ld=ld --target=x86_64-unknown-linux-gnu --gcc-install-dir=%S/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0 | FileCheck %s --check-prefix=CHECK-X86-64
+! RUN: %flang 2>&1 -### -v -o %t %s -no-integrated-as -fuse-ld=ld --target=x86_64-unknown-linux-gnu --gcc-toolchain=%S/Inputs/basic_cross_linux_tree/usr | FileCheck %s --check-prefix=CHECK-X86-64
+! CHECK-X86-64: Selected GCC installation: [[PREFIX:[^"]+]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0
+! CHECK-X86-64: "-fc1" "-triple" "x86_64-unknown-linux-gnu"
+! CHECK-X86-64: "[[PREFIX:[^"]+]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/../../../../x86_64-unknown-linux-gnu/bin{{/|\\\\}}as" "--64"
+! CHECK-X86-64: "[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/../../../../x86_64-unknown-linux-gnu/bin{{/|\\\\}}ld" {{.*}} "-m" "elf_x86_64"
+! CHECK-X86-64-SAME: "-L[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0"
+! CHECK-X86-64-SAME: "-L[[PREFIX]]/Inputs/basic_cross_linux_tree/usr/lib/gcc/x86_64-unknown-linux-gnu/10.2.0/../../../../x86_64-unknown-linux-gnu/lib"
diff --git a/flang/test/Lower/HLFIR/assumed-rank-iface.f90 b/flang/test/Lower/HLFIR/assumed-rank-iface.f90
index 5df794434844..155ce8fb55f2 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-iface.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-iface.f90
@@ -133,9 +133,20 @@ end subroutine
! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_7]] : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<!fir.array<*:i32>>
! CHECK: fir.call @_QPint_opt_assumed_rank(%[[VAL_11]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
-! TODO: set assumed size last extent to -1.
-!subroutine int_r2_assumed_size_to_assumed_rank(x)
-! use ifaces, only : int_assumed_rank
-! integer :: x(10, *)
-! call int_assumed_rank(x)
-!end subroutine
+subroutine int_r2_assumed_size_to_assumed_rank(x)
+ use ifaces, only : int_assumed_rank
+ integer :: x(10, *)
+ call int_assumed_rank(x)
+end subroutine
+! CHECK-LABEL: func.func @_QPint_r2_assumed_size_to_assumed_rank(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<10x?xi32>> {fir.bindc_name = "x"}) {
+! CHECK: %[[VAL_1:.*]] = arith.constant 10 : i64
+! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i64) -> index
+! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
+! CHECK: %[[VAL_4:.*]] = arith.cmpi sgt, %[[VAL_2]], %[[VAL_3]] : index
+! CHECK: %[[VAL_5:.*]] = arith.select %[[VAL_4]], %[[VAL_2]], %[[VAL_3]] : index
+! CHECK: %[[VAL_6:.*]] = arith.constant -1 : index
+! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_5]], %[[VAL_6]] : (index, index) -> !fir.shape<2>
+! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) {uniq_name = "_QFint_r2_assumed_size_to_assumed_rankEx"} : (!fir.ref<!fir.array<10x?xi32>>, !fir.shape<2>) -> (!fir.box<!fir.array<10x?xi32>>, !fir.ref<!fir.array<10x?xi32>>)
+! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]]#0 : (!fir.box<!fir.array<10x?xi32>>) -> !fir.box<!fir.array<*:i32>>
+! CHECK: fir.call @_QPint_assumed_rank(%[[VAL_9]]) fastmath<contract> : (!fir.box<!fir.array<*:i32>>) -> ()
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array.f90
index 735a99854308..56dcabbb75c3 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-array.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-array.f90
@@ -50,7 +50,7 @@ end program
! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index
! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "_QFEi"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
-! CHECK: %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#1(%[[VAL_2]]) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
+! CHECK: %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0(%[[VAL_2]]) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
! CHECK: %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
! CHECK: fir.store %[[VAL_4]] to %[[VAL_5]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
! CHECK: omp.parallel byref reduction(@add_reduction_byref_box_3xi32 %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref<!fir.box<!fir.array<3xi32>>>) {
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90
index 4834047a98a4..94bff410a2f0 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90
@@ -50,7 +50,7 @@ end program
! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index
! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "_QFEi"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
-! CHECK: %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#1(%[[VAL_2]]) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
+! CHECK: %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0(%[[VAL_2]]) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
! CHECK: %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
! CHECK: fir.store %[[VAL_4]] to %[[VAL_5]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
! CHECK: omp.parallel byref reduction(@add_reduction_byref_box_3xi32 %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref<!fir.box<!fir.array<3xi32>>>) {
diff --git a/flang/test/Lower/OpenMP/parallel-reduction3.f90 b/flang/test/Lower/OpenMP/parallel-reduction3.f90
new file mode 100644
index 000000000000..b25759713e31
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-reduction3.f90
@@ -0,0 +1,125 @@
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! The script is designed to make adding checks to
+! a test case fast, it is *not* designed to be authoritative
+! about what constitutes a good test! The CHECK should be
+! minimized and named to reflect the test intent.
+
+! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+
+
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<?xi32>>>):
+! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32
+! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
+! CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_3]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]]#1 : (index) -> !fir.shape<1>
+! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.array<?xi32>, %[[VAL_4]]#1 {bindc_name = ".tmp"}
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]](%[[VAL_5]]) {uniq_name = ".tmp"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>)
+! CHECK: hlfir.assign %[[VAL_1]] to %[[VAL_7]]#0 : i32, !fir.box<!fir.array<?xi32>>
+! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
+! CHECK: fir.store %[[VAL_7]]#0 to %[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xi32>>>)
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<?xi32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<?xi32>>>):
+! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
+! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_4]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! CHECK: %[[VAL_6:.*]] = fir.shape_shift %[[VAL_5]]#0, %[[VAL_5]]#1 : (index, index) -> !fir.shapeshift<1>
+! CHECK: %[[VAL_7:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_8:.*]] = %[[VAL_7]] to %[[VAL_5]]#1 step %[[VAL_7]] unordered {
+! CHECK: %[[VAL_9:.*]] = fir.array_coor %[[VAL_2]](%[[VAL_6]]) %[[VAL_8]] : (!fir.box<!fir.array<?xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+! CHECK: %[[VAL_10:.*]] = fir.array_coor %[[VAL_3]](%[[VAL_6]]) %[[VAL_8]] : (!fir.box<!fir.array<?xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_10]] : !fir.ref<i32>
+! CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32
+! CHECK: fir.store %[[VAL_13]] to %[[VAL_9]] : !fir.ref<i32>
+! CHECK: }
+! CHECK: omp.yield(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xi32>>>)
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QPs(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}) {
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsEi"}
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i32) -> i64
+! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i64) -> index
+! CHECK: %[[VAL_7:.*]] = arith.constant 0 : index
+! CHECK: %[[VAL_8:.*]] = arith.cmpi sgt, %[[VAL_6]], %[[VAL_7]] : index
+! CHECK: %[[VAL_9:.*]] = arith.select %[[VAL_8]], %[[VAL_6]], %[[VAL_7]] : index
+! CHECK: %[[VAL_10:.*]] = fir.alloca !fir.array<?xi32>, %[[VAL_9]] {bindc_name = "c", uniq_name = "_QFsEc"}
+! CHECK: %[[VAL_11:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1>
+! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]](%[[VAL_11]]) {uniq_name = "_QFsEc"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>)
+! CHECK: %[[VAL_13:.*]] = arith.constant 0 : i32
+! CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_12]]#0 : i32, !fir.box<!fir.array<?xi32>>
+! CHECK: omp.parallel {
+! CHECK: %[[VAL_14:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_16:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_17:.*]] = arith.constant 100 : i32
+! CHECK: %[[VAL_18:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_19:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
+! CHECK: fir.store %[[VAL_12]]#0 to %[[VAL_19]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+! CHECK: omp.wsloop byref reduction(@add_reduction_byref_box_Uxi32 %[[VAL_19]] -> %[[VAL_20:.*]] : !fir.ref<!fir.box<!fir.array<?xi32>>>) for (%[[VAL_21:.*]]) : i32 = (%[[VAL_16]]) to (%[[VAL_17]]) inclusive step (%[[VAL_18]]) {
+! CHECK: fir.store %[[VAL_21]] to %[[VAL_15]]#1 : !fir.ref<i32>
+! CHECK: %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_20]] {uniq_name = "_QFsEc"} : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> (!fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.ref<!fir.box<!fir.array<?xi32>>>)
+! CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_22]]#0 : !fir.ref<!fir.box<!fir.array<?xi32>>>
+! CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_25:.*]] = arith.constant 0 : index
+! CHECK: %[[VAL_26:.*]]:3 = fir.box_dims %[[VAL_23]], %[[VAL_25]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! CHECK: %[[VAL_27:.*]] = fir.shape %[[VAL_26]]#1 : (index) -> !fir.shape<1>
+! CHECK: %[[VAL_28:.*]] = hlfir.elemental %[[VAL_27]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
+! CHECK: ^bb0(%[[VAL_29:.*]]: index):
+! CHECK: %[[VAL_30:.*]] = arith.constant 0 : index
+! CHECK: %[[VAL_31:.*]]:3 = fir.box_dims %[[VAL_23]], %[[VAL_30]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+! CHECK: %[[VAL_32:.*]] = arith.constant 1 : index
+! CHECK: %[[VAL_33:.*]] = arith.subi %[[VAL_31]]#0, %[[VAL_32]] : index
+! CHECK: %[[VAL_34:.*]] = arith.addi %[[VAL_29]], %[[VAL_33]] : index
+! CHECK: %[[VAL_35:.*]] = hlfir.designate %[[VAL_23]] (%[[VAL_34]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<i32>
+! CHECK: %[[VAL_37:.*]] = arith.addi %[[VAL_36]], %[[VAL_24]] : i32
+! CHECK: hlfir.yield_element %[[VAL_37]] : i32
+! CHECK: }
+! CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_22]]#0 : !fir.ref<!fir.box<!fir.array<?xi32>>>
+! CHECK: hlfir.assign %[[VAL_28]] to %[[VAL_38]] : !hlfir.expr<?xi32>, !fir.box<!fir.array<?xi32>>
+! CHECK: hlfir.destroy %[[VAL_28]] : !hlfir.expr<?xi32>
+! CHECK: omp.yield
+! CHECK: }
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: %[[VAL_39:.*]] = arith.constant 1 : index
+! CHECK: %[[VAL_40:.*]] = hlfir.designate %[[VAL_12]]#0 (%[[VAL_39]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[VAL_41:.*]] = fir.load %[[VAL_40]] : !fir.ref<i32>
+! CHECK: %[[VAL_42:.*]] = arith.constant 5050 : i32
+! CHECK: %[[VAL_43:.*]] = arith.cmpi ne, %[[VAL_41]], %[[VAL_42]] : i32
+! CHECK: cf.cond_br %[[VAL_43]], ^bb1, ^bb2
+! CHECK: ^bb1:
+! CHECK: %[[VAL_44:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_45:.*]] = arith.constant false
+! CHECK: %[[VAL_46:.*]] = arith.constant false
+! CHECK: %[[VAL_47:.*]] = fir.call @_FortranAStopStatement(%[[VAL_44]], %[[VAL_45]], %[[VAL_46]]) fastmath<contract> : (i32, i1, i1) -> none
+! CHECK: fir.unreachable
+! CHECK: ^bb2:
+! CHECK: return
+! CHECK: }
+! CHECK: func.func private @_FortranAStopStatement(i32, i1, i1) -> none attributes {fir.runtime}
+
+subroutine s(x)
+ integer :: x
+ integer :: c(x)
+ c = 0
+ !$omp parallel do reduction(+:c)
+ do i = 1, 100
+ c = c + i
+ end do
+ !$omp end parallel do
+
+ if (c(1) /= 5050) stop 1
+end subroutine s \ No newline at end of file
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
new file mode 100644
index 000000000000..a1f339faea5c
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
@@ -0,0 +1,90 @@
+! RUN: bbc -emit-hlfir -fopenmp -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s | FileCheck %s
+
+program reduce_assumed_shape
+real(8), dimension(2) :: r
+r = 0
+call reduce(r)
+print *, r
+
+contains
+subroutine reduce(r)
+ implicit none
+ real(8),intent(inout) :: r(:)
+ integer :: i = 0
+
+ !$omp parallel do reduction(+:r)
+ do i=0,10
+ r(1) = i
+ r(2) = 1
+ enddo
+ !$omp end parallel do
+end subroutine
+end program
+
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf64 : !fir.ref<!fir.box<!fir.array<?xf64>>> init {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<?xf64>>>):
+! CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64
+! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
+! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
+! CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_3]] : (!fir.box<!fir.array<?xf64>>, index) -> (index, index, index)
+! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]]#1 : (index) -> !fir.shape<1>
+! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.array<?xf64>, %[[VAL_4]]#1 {bindc_name = ".tmp"}
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]](%[[VAL_5]]) {uniq_name = ".tmp"} : (!fir.ref<!fir.array<?xf64>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.array<?xf64>>)
+! CHECK: hlfir.assign %[[VAL_1]] to %[[VAL_7]]#0 : f64, !fir.box<!fir.array<?xf64>>
+! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<?xf64>>
+! CHECK: fir.store %[[VAL_7]]#0 to %[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
+! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xf64>>>)
+
+! CHECK-LABEL: } combiner {
+! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<?xf64>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<?xf64>>>):
+! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
+! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
+! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
+! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_4]] : (!fir.box<!fir.array<?xf64>>, index) -> (index, index, index)
+! CHECK: %[[VAL_6:.*]] = fir.shape_shift %[[VAL_5]]#0, %[[VAL_5]]#1 : (index, index) -> !fir.shapeshift<1>
+! CHECK: %[[VAL_7:.*]] = arith.constant 1 : index
+! CHECK: fir.do_loop %[[VAL_8:.*]] = %[[VAL_7]] to %[[VAL_5]]#1 step %[[VAL_7]] unordered {
+! CHECK: %[[VAL_9:.*]] = fir.array_coor %[[VAL_2]](%[[VAL_6]]) %[[VAL_8]] : (!fir.box<!fir.array<?xf64>>, !fir.shapeshift<1>, index) -> !fir.ref<f64>
+! CHECK: %[[VAL_10:.*]] = fir.array_coor %[[VAL_3]](%[[VAL_6]]) %[[VAL_8]] : (!fir.box<!fir.array<?xf64>>, !fir.shapeshift<1>, index) -> !fir.ref<f64>
+! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<f64>
+! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_10]] : !fir.ref<f64>
+! CHECK: %[[VAL_13:.*]] = arith.addf %[[VAL_11]], %[[VAL_12]] fastmath<contract> : f64
+! CHECK: fir.store %[[VAL_13]] to %[[VAL_9]] : !fir.ref<f64>
+! CHECK: }
+! CHECK: omp.yield(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf64>>>)
+! CHECK: }
+
+! CHECK-LABEL: func.func private @_QFPreduce(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xf64>> {fir.bindc_name = "r"}) attributes {{.*}} {
+! CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFFreduceEi) : !fir.ref<i32>
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFFreduceEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = {{.*}}, uniq_name = "_QFFreduceEr"} : (!fir.box<!fir.array<?xf64>>) -> (!fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>)
+! CHECK: omp.parallel {
+! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFFreduceEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32
+! CHECK: %[[VAL_7:.*]] = arith.constant 10 : i32
+! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_9:.*]] = fir.alloca !fir.box<!fir.array<?xf64>>
+! CHECK: fir.store %[[VAL_3]]#1 to %[[VAL_9]] : !fir.ref<!fir.box<!fir.array<?xf64>>>
+! CHECK: omp.wsloop byref reduction(@add_reduction_byref_box_Uxf64 %[[VAL_9]] -> %[[VAL_10:.*]] : !fir.ref<!fir.box<!fir.array<?xf64>>>) for (%[[VAL_11:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
+! CHECK: fir.store %[[VAL_11]] to %[[VAL_5]]#1 : !fir.ref<i32>
+! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {fortran_attrs = {{.*}}, uniq_name = "_QFFreduceEr"} : (!fir.ref<!fir.box<!fir.array<?xf64>>>) -> (!fir.ref<!fir.box<!fir.array<?xf64>>>, !fir.ref<!fir.box<!fir.array<?xf64>>>)
+! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
+! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i32) -> f64
+! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<!fir.box<!fir.array<?xf64>>>
+! CHECK: %[[VAL_16:.*]] = arith.constant 1 : index
+! CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_15]] (%[[VAL_16]]) : (!fir.box<!fir.array<?xf64>>, index) -> !fir.ref<f64>
+! CHECK: hlfir.assign %[[VAL_14]] to %[[VAL_17]] : f64, !fir.ref<f64>
+! CHECK: %[[VAL_18:.*]] = arith.constant 1.000000e+00 : f64
+! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<!fir.box<!fir.array<?xf64>>>
+! CHECK: %[[VAL_20:.*]] = arith.constant 2 : index
+! CHECK: %[[VAL_21:.*]] = hlfir.designate %[[VAL_19]] (%[[VAL_20]]) : (!fir.box<!fir.array<?xf64>>, index) -> !fir.ref<f64>
+! CHECK: hlfir.assign %[[VAL_18]] to %[[VAL_21]] : f64, !fir.ref<f64>
+! CHECK: omp.yield
+! CHECK: }
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: return
+! CHECK: }
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90
index a20ed1ca83ce..a898204c881d 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90
@@ -60,7 +60,7 @@ end program
! CHECK: %[[VAL_8:.*]] = arith.constant 0 : i32
! CHECK: %[[VAL_9:.*]] = arith.constant 10 : i32
! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_11:.*]] = fir.embox %[[VAL_5]]#1(%[[VAL_4]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>>
+! CHECK: %[[VAL_11:.*]] = fir.embox %[[VAL_5]]#0(%[[VAL_4]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>>
! CHECK: %[[VAL_12:.*]] = fir.alloca !fir.box<!fir.array<2xi32>>
! CHECK: fir.store %[[VAL_11]] to %[[VAL_12]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
! CHECK: omp.wsloop byref reduction(@add_reduction_byref_box_2xi32 %[[VAL_12]] -> %[[VAL_13:.*]] : !fir.ref<!fir.box<!fir.array<2xi32>>>) for (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90
index 61599876da8e..f3745c846091 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90
@@ -60,7 +60,7 @@ end program
! CHECK: %[[VAL_8:.*]] = arith.constant 0 : i32
! CHECK: %[[VAL_9:.*]] = arith.constant 10 : i32
! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_11:.*]] = fir.embox %[[VAL_5]]#1(%[[VAL_4]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>>
+! CHECK: %[[VAL_11:.*]] = fir.embox %[[VAL_5]]#0(%[[VAL_4]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>>
! CHECK: %[[VAL_12:.*]] = fir.alloca !fir.box<!fir.array<2xi32>>
! CHECK: fir.store %[[VAL_11]] to %[[VAL_12]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
! CHECK: omp.wsloop byref reduction(@add_reduction_byref_box_2xi32 %[[VAL_12]] -> %[[VAL_13:.*]] : !fir.ref<!fir.box<!fir.array<2xi32>>>) for (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
diff --git a/flang/test/Lower/stop-statement.f90 b/flang/test/Lower/stop-statement.f90
index bc94a7ee23a6..cf0665cf5dbd 100644
--- a/flang/test/Lower/stop-statement.f90
+++ b/flang/test/Lower/stop-statement.f90
@@ -21,10 +21,10 @@ end subroutine
! CHECK-LABEL: stop_error
subroutine stop_error()
error stop
- ! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : i32
+ ! CHECK-DAG: %[[c_1:.*]] = arith.constant 1 : i32
! CHECK-DAG: %[[true:.*]] = arith.constant true
! CHECK-DAG: %[[false:.*]] = arith.constant false
- ! CHECK: fir.call @_Fortran{{.*}}StopStatement(%[[c0]], %[[true]], %[[false]])
+ ! CHECK: fir.call @_Fortran{{.*}}StopStatement(%[[c_1]], %[[true]], %[[false]])
! CHECK-NEXT: fir.unreachable
end subroutine
diff --git a/flang/test/Semantics/cuf03.cuf b/flang/test/Semantics/cuf03.cuf
index 41bfbb767813..7384a104831d 100644
--- a/flang/test/Semantics/cuf03.cuf
+++ b/flang/test/Semantics/cuf03.cuf
@@ -51,7 +51,8 @@ module m
contains
attributes(device) subroutine devsubr(n,da)
integer, intent(in) :: n
- real, device :: da(*) ! ok
+ !ERROR: Object 'da' with ATTRIBUTES(DEVICE) may not be assumed size
+ real, device :: da(*)
real, managed :: ma(n) ! ok
!WARNING: Pointer 'dp' may not be associated in a device subprogram
real, device, pointer :: dp
diff --git a/flang/unittests/Runtime/Time.cpp b/flang/unittests/Runtime/Time.cpp
index ec0caa743bcd..5c93282bca61 100644
--- a/flang/unittests/Runtime/Time.cpp
+++ b/flang/unittests/Runtime/Time.cpp
@@ -12,7 +12,7 @@
#include "flang/Runtime/time-intrinsic.h"
#include <algorithm>
#include <cctype>
-#include <charconv>
+#include <cerrno>
#include <string>
using namespace Fortran::runtime;
@@ -104,10 +104,9 @@ TEST(TimeIntrinsics, DateAndTime) {
EXPECT_TRUE(true);
} else {
count_t number{-1};
- auto [_, ec]{
- std::from_chars(date.data(), date.data() + date.size(), number)};
- ASSERT_TRUE(ec != std::errc::invalid_argument &&
- ec != std::errc::result_out_of_range);
+ // Use stol to allow GCC 7.5 to build tests
+ number = std::stol(date);
+ ASSERT_TRUE(errno != ERANGE);
EXPECT_GE(number, 0);
auto year = number / 10000;
auto month = (number - year * 10000) / 100;
@@ -121,14 +120,15 @@ TEST(TimeIntrinsics, DateAndTime) {
}
// Validate time is hhmmss.sss or blank.
+ std::string acceptedPattern("hhmmss.sss");
if (isBlank(time)) {
EXPECT_TRUE(true);
} else {
count_t number{-1};
- auto [next, ec]{
- std::from_chars(time.data(), time.data() + date.size(), number)};
- ASSERT_TRUE(ec != std::errc::invalid_argument &&
- ec != std::errc::result_out_of_range);
+ // Use stol to allow GCC 7.5 to build tests
+ auto dotPosition = acceptedPattern.find('.');
+ number = std::stol(time.substr(0, dotPosition));
+ ASSERT_TRUE(errno != ERANGE);
ASSERT_GE(number, 0);
auto hours = number / 10000;
auto minutes = (number - hours * 10000) / 100;
@@ -137,15 +137,11 @@ TEST(TimeIntrinsics, DateAndTime) {
EXPECT_LE(minutes, 59);
// Accept 60 for leap seconds.
EXPECT_LE(seconds, 60);
- ASSERT_TRUE(next != time.data() + time.size());
- EXPECT_EQ(*next, '.');
+ EXPECT_EQ(time.substr(dotPosition, 1), ".");
count_t milliseconds{-1};
- ASSERT_TRUE(next + 1 != time.data() + time.size());
- auto [_, ec2]{
- std::from_chars(next + 1, time.data() + date.size(), milliseconds)};
- ASSERT_TRUE(ec2 != std::errc::invalid_argument &&
- ec2 != std::errc::result_out_of_range);
+ milliseconds = std::stol(time.substr(dotPosition + 1, 3));
+ ASSERT_TRUE(errno != ERANGE);
EXPECT_GE(milliseconds, 0);
EXPECT_LE(milliseconds, 999);
}
@@ -157,10 +153,9 @@ TEST(TimeIntrinsics, DateAndTime) {
ASSERT_TRUE(zone.size() > 1);
EXPECT_TRUE(zone[0] == '+' || zone[0] == '-');
count_t number{-1};
- auto [next, ec]{
- std::from_chars(zone.data() + 1, zone.data() + zone.size(), number)};
- ASSERT_TRUE(ec != std::errc::invalid_argument &&
- ec != std::errc::result_out_of_range);
+ // Use stol to allow GCC 7.5 to build tests
+ number = std::stol(zone.substr(1, 4));
+ ASSERT_TRUE(errno != ERANGE);
ASSERT_GE(number, 0);
auto hours = number / 100;
auto minutes = number % 100;
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 40a1cfda060e..5b3a10d55fed 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -43,6 +43,7 @@ function(_get_common_compile_options output_var flags)
list(APPEND compile_options "-fpie")
if(LLVM_LIBC_FULL_BUILD)
+ list(APPEND compile_options "-DLIBC_FULL_BUILD")
# Only add -ffreestanding flag in full build mode.
list(APPEND compile_options "-ffreestanding")
endif()
@@ -126,6 +127,7 @@ function(_get_common_test_compile_options output_var c_test flags)
list(APPEND compile_options "-fpie")
if(LLVM_LIBC_FULL_BUILD)
+ list(APPEND compile_options "-DLIBC_FULL_BUILD")
# Only add -ffreestanding flag in full build mode.
list(APPEND compile_options "-ffreestanding")
list(APPEND compile_options "-fno-exceptions")
@@ -178,5 +180,10 @@ function(_get_hermetic_test_compile_options output_var flags)
-Wno-multi-gpu --cuda-path=${LIBC_CUDA_ROOT}
-nogpulib -march=${LIBC_GPU_TARGET_ARCHITECTURE} -fno-use-cxa-atexit)
endif()
+
+ if(LLVM_LIBC_FULL_BUILD)
+ list(APPEND compile_options "-DLIBC_FULL_BUILD")
+ endif()
+
set(${output_var} ${compile_options} PARENT_SCOPE)
endfunction()
diff --git a/libc/config/baremetal/api.td b/libc/config/baremetal/api.td
index 690edbda1311..25aa06aacb64 100644
--- a/libc/config/baremetal/api.td
+++ b/libc/config/baremetal/api.td
@@ -57,10 +57,7 @@ def MathAPI : PublicAPI<"math.h"> {
}
def StdIOAPI : PublicAPI<"stdio.h"> {
- let Types = [
- "size_t",
- "off_t",
- ];
+ let Types = ["size_t"];
}
def StdlibAPI : PublicAPI<"stdlib.h"> {
diff --git a/libc/config/gpu/api.td b/libc/config/gpu/api.td
index 523ad49ffa3f..adaf5bfd747a 100644
--- a/libc/config/gpu/api.td
+++ b/libc/config/gpu/api.td
@@ -64,11 +64,7 @@ def StdIOAPI : PublicAPI<"stdio.h"> {
SimpleMacroDef<"_IOLBF", "1">,
SimpleMacroDef<"_IONBF", "2">,
];
- let Types = [
- "FILE",
- "off_t",
- "size_t",
- ];
+ let Types = ["size_t", "FILE"];
}
def IntTypesAPI : PublicAPI<"inttypes.h"> {
diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td
index 9964971f191b..eb5ed8089850 100644
--- a/libc/config/linux/api.td
+++ b/libc/config/linux/api.td
@@ -49,10 +49,7 @@ def CTypeAPI : PublicAPI<"ctype.h"> {
}
def FCntlAPI : PublicAPI<"fcntl.h"> {
- let Types = [
- "mode_t",
- "off_t",
- ];
+ let Types = ["mode_t"];
}
def IntTypesAPI : PublicAPI<"inttypes.h"> {
@@ -80,12 +77,7 @@ def StdIOAPI : PublicAPI<"stdio.h"> {
SimpleMacroDef<"_IOLBF", "1">,
SimpleMacroDef<"_IONBF", "2">,
];
- let Types = [
- "FILE",
- "cookie_io_functions_t",
- "off_t",
- "size_t",
- ];
+ let Types = ["size_t", "FILE", "cookie_io_functions_t"];
}
def StdlibAPI : PublicAPI<"stdlib.h"> {
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index cc7671cab081..2742c33ae478 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -370,6 +370,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.exp10f
libc.src.math.exp2
libc.src.math.exp2f
+ libc.src.math.exp2m1f
libc.src.math.expm1
libc.src.math.expm1f
libc.src.math.fabs
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index 265261be31a2..970a43ca87c9 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -270,7 +270,7 @@ Higher Math Functions
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| exp2 | |check| | |check| | | | | 7.12.6.4 | F.10.3.4 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| exp2m1 | | | | | | 7.12.6.5 | F.10.3.5 |
+| exp2m1 | |check| | | | | | 7.12.6.5 | F.10.3.5 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| expm1 | |check| | |check| | | | | 7.12.6.6 | F.10.3.6 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/fuzzing/CMakeLists.txt b/libc/fuzzing/CMakeLists.txt
index 82487688af11..816691b4bd44 100644
--- a/libc/fuzzing/CMakeLists.txt
+++ b/libc/fuzzing/CMakeLists.txt
@@ -1,6 +1,7 @@
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=fuzzer")
add_custom_target(libc-fuzzer)
+add_subdirectory(__support)
# TODO(#85680): Re-enable math fuzzing after headers are sorted out
# add_subdirectory(math)
add_subdirectory(stdlib)
diff --git a/libc/fuzzing/__support/CMakeLists.txt b/libc/fuzzing/__support/CMakeLists.txt
new file mode 100644
index 000000000000..278e914e3fbe
--- /dev/null
+++ b/libc/fuzzing/__support/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_libc_fuzzer(
+ uint_fuzz
+ SRCS
+ uint_fuzz.cpp
+ DEPENDS
+ libc.src.__support.uint
+)
diff --git a/libc/fuzzing/__support/uint_fuzz.cpp b/libc/fuzzing/__support/uint_fuzz.cpp
new file mode 100644
index 000000000000..f48f00d3b4ba
--- /dev/null
+++ b/libc/fuzzing/__support/uint_fuzz.cpp
@@ -0,0 +1,70 @@
+#include "src/__support/CPP/bit.h"
+#include "src/__support/UInt.h"
+#include "src/string/memory_utils/inline_memcpy.h"
+
+using namespace LIBC_NAMESPACE;
+
+// Helper function when using gdb / lldb to set a breakpoint and inspect values.
+template <typename T> void debug_and_trap(const char *msg, T a, T b) {
+ __builtin_trap();
+}
+
+#define DEBUG_AND_TRAP()
+
+#define TEST_BINOP(OP) \
+ if ((a OP b) != (static_cast<T>(BigInt(a) OP BigInt(b)))) \
+ debug_and_trap(#OP, a, b);
+
+#define TEST_SHIFTOP(OP) \
+ if ((a OP b) != (static_cast<T>(BigInt(a) OP b))) \
+ debug_and_trap(#OP, a, b);
+
+#define TEST_FUNCTION(FUN) \
+ if (FUN(a) != FUN(BigInt(a))) \
+ debug_and_trap(#FUN, a, b);
+
+// Test that basic arithmetic operations of BigInt behave like their scalar
+// counterparts.
+template <typename T, typename BigInt> void run_tests(T a, T b) {
+ TEST_BINOP(+)
+ TEST_BINOP(-)
+ TEST_BINOP(*)
+ if (b != 0)
+ TEST_BINOP(/)
+ if (b >= 0 && b < cpp::numeric_limits<T>::digits) {
+ TEST_SHIFTOP(<<)
+ TEST_SHIFTOP(>>)
+ }
+ if constexpr (!BigInt::SIGNED) {
+ TEST_FUNCTION(cpp::has_single_bit)
+ TEST_FUNCTION(cpp::countr_zero)
+ TEST_FUNCTION(cpp::countl_zero)
+ TEST_FUNCTION(cpp::countl_one)
+ TEST_FUNCTION(cpp::countr_one)
+ }
+}
+
+// Reads a T from libfuzzer data.
+template <typename T> T read(const uint8_t *data, size_t &remainder) {
+ T out = 0;
+ constexpr size_t T_SIZE = sizeof(T);
+ const size_t copy_size = remainder < T_SIZE ? remainder : T_SIZE;
+ inline_memcpy(&out, data, copy_size);
+ remainder -= copy_size;
+ return out;
+}
+
+template <typename T, typename BigInt>
+void run_tests(const uint8_t *data, size_t size) {
+ const auto a = read<T>(data, size);
+ const auto b = read<T>(data, size);
+ run_tests<T, BigInt>(a, b);
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+ // unsigned
+ run_tests<uint64_t, BigInt<64, false, uint16_t>>(data, size);
+ // signed
+ run_tests<int64_t, BigInt<64, true, uint16_t>>(data, size);
+ return 0;
+}
diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index 02c7dc8fbc0b..4203f0bc901b 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -41,10 +41,9 @@ add_gen_header(
DEF_FILE fcntl.h.def
GEN_HDR fcntl.h
DEPENDS
+ .llvm_libc_common_h
.llvm-libc-macros.fcntl_macros
.llvm-libc-types.mode_t
- .llvm-libc-types.off_t
- .llvm_libc_common_h
)
add_gen_header(
@@ -265,14 +264,13 @@ add_gen_header(
DEF_FILE stdio.h.def
GEN_HDR stdio.h
DEPENDS
+ .llvm_libc_common_h
.llvm-libc-macros.file_seek_macros
.llvm-libc-macros.stdio_macros
- .llvm-libc-types.FILE
- .llvm-libc-types.cookie_io_functions_t
- .llvm-libc-types.off_t
.llvm-libc-types.size_t
.llvm-libc-types.ssize_t
- .llvm_libc_common_h
+ .llvm-libc-types.FILE
+ .llvm-libc-types.cookie_io_functions_t
)
add_gen_header(
diff --git a/libc/include/llvm-libc-macros/math-macros.h b/libc/include/llvm-libc-macros/math-macros.h
index 1497e32044e9..6046ea98cb8a 100644
--- a/libc/include/llvm-libc-macros/math-macros.h
+++ b/libc/include/llvm-libc-macros/math-macros.h
@@ -9,6 +9,11 @@
#ifndef LLVM_LIBC_MACROS_MATH_MACROS_H
#define LLVM_LIBC_MACROS_MATH_MACROS_H
+// TODO: Remove this. This is a temporary fix for a downstream problem.
+// This cannot be left permanently since it would require downstream users to
+// define this macro.
+#ifdef LIBC_FULL_BUILD
+
#include "limits-macros.h"
#define FP_NAN 0
@@ -79,4 +84,10 @@ template <typename T> inline constexpr bool isnan(T x) {
#endif
+#else // LIBC_FULL_BUILD
+
+#include <math.h>
+
+#endif // LIBC_FULL_BUILD
+
#endif // LLVM_LIBC_MACROS_MATH_MACROS_H
diff --git a/libc/spec/posix.td b/libc/spec/posix.td
index 45f7ecfe84e9..cfa8d3afedde 100644
--- a/libc/spec/posix.td
+++ b/libc/spec/posix.td
@@ -210,10 +210,7 @@ def POSIX : StandardSpec<"POSIX"> {
HeaderSpec FCntl = HeaderSpec<
"fcntl.h",
[], // Macros
- [
- ModeTType,
- OffTType,
- ],
+ [ModeTType],
[], // Enumerations
[
FunctionSpec<
@@ -1183,7 +1180,7 @@ def POSIX : StandardSpec<"POSIX"> {
HeaderSpec StdIO = HeaderSpec<
"stdio.h",
[], // Macros
- [OffTType], // Types
+ [], // Types
[], // Enumerations
[
FunctionSpec<
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 719bb9aa18cb..bd62870b07c8 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -535,6 +535,8 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"exp2", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"exp2f", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
+ FunctionSpec<"exp2m1f", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
+
FunctionSpec<"expm1", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"expm1f", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h
index 73fd7381c3c8..e0c205f52383 100644
--- a/libc/src/__support/FPUtil/dyadic_float.h
+++ b/libc/src/__support/FPUtil/dyadic_float.h
@@ -58,9 +58,9 @@ template <size_t Bits> struct DyadicFloat {
// significant bit.
LIBC_INLINE constexpr DyadicFloat &normalize() {
if (!mantissa.is_zero()) {
- int shift_length = static_cast<int>(mantissa.clz());
+ int shift_length = cpp::countl_zero(mantissa);
exponent -= shift_length;
- mantissa.shift_left(static_cast<size_t>(shift_length));
+ mantissa <<= static_cast<size_t>(shift_length);
}
return *this;
}
@@ -233,7 +233,7 @@ LIBC_INLINE constexpr DyadicFloat<Bits> quick_add(DyadicFloat<Bits> a,
result.sign = a.sign;
result.exponent = a.exponent;
result.mantissa = a.mantissa;
- if (result.mantissa.add(b.mantissa)) {
+ if (result.mantissa.add_overflow(b.mantissa)) {
// Mantissa addition overflow.
result.shift_right(1);
result.mantissa.val[DyadicFloat<Bits>::MantissaType::WORD_COUNT - 1] |=
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index 5dcae518bb6f..05506c04fc07 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -198,12 +198,9 @@ template <bool Invert> struct Process {
/// convergent, otherwise the compiler will sink the store and deadlock.
[[clang::convergent]] LIBC_INLINE void unlock(uint64_t lane_mask,
uint32_t index) {
- // Do not move any writes past the unlock
+ // Do not move any writes past the unlock.
atomic_thread_fence(cpp::MemoryOrder::RELEASE);
- // Wait for other threads in the warp to finish using the lock
- gpu::sync_lane(lane_mask);
-
// Use exactly one thread to clear the nth bit in the lock array Must
// restrict to a single thread to avoid one thread dropping the lock, then
// an unrelated warp claiming the lock, then a second thread in this warp
@@ -331,6 +328,9 @@ public:
LIBC_INLINE uint16_t get_index() const { return index; }
LIBC_INLINE void close() {
+ // Wait for all lanes to finish using the port.
+ gpu::sync_lane(lane_mask);
+
// The server is passive, if it own the buffer when it closes we need to
// give ownership back to the client.
if (owns_buffer && T)
diff --git a/libc/src/__support/UInt.h b/libc/src/__support/UInt.h
index 282efdba1c5f..c1e55ceef211 100644
--- a/libc/src/__support/UInt.h
+++ b/libc/src/__support/UInt.h
@@ -14,10 +14,11 @@
#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/optional.h"
#include "src/__support/CPP/type_traits.h"
-#include "src/__support/macros/attributes.h" // LIBC_INLINE
-#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/macros/attributes.h" // LIBC_INLINE
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/macros/properties/compiler.h" // LIBC_COMPILER_IS_CLANG
#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128, LIBC_TYPES_HAS_INT64
-#include "src/__support/math_extras.h" // SumCarry, DiffBorrow
+#include "src/__support/math_extras.h" // add_with_carry, sub_with_borrow
#include "src/__support/number_pair.h"
#include <stddef.h> // For size_t
@@ -25,71 +26,324 @@
namespace LIBC_NAMESPACE {
-namespace internal {
-template <typename T> struct half_width;
+namespace multiword {
-template <> struct half_width<uint64_t> : cpp::type_identity<uint32_t> {};
-template <> struct half_width<uint32_t> : cpp::type_identity<uint16_t> {};
+// A type trait mapping unsigned integers to their half-width unsigned
+// counterparts.
+template <typename T> struct half_width;
template <> struct half_width<uint16_t> : cpp::type_identity<uint8_t> {};
+template <> struct half_width<uint32_t> : cpp::type_identity<uint16_t> {};
+#ifdef LIBC_TYPES_HAS_INT64
+template <> struct half_width<uint64_t> : cpp::type_identity<uint32_t> {};
#ifdef LIBC_TYPES_HAS_INT128
template <> struct half_width<__uint128_t> : cpp::type_identity<uint64_t> {};
#endif // LIBC_TYPES_HAS_INT128
-
+#endif // LIBC_TYPES_HAS_INT64
template <typename T> using half_width_t = typename half_width<T>::type;
-template <typename T> constexpr NumberPair<T> full_mul(T a, T b) {
- NumberPair<T> pa = split(a);
- NumberPair<T> pb = split(b);
- NumberPair<T> prod;
+// An array of two elements that can be used in multiword operations.
+template <typename T> struct DoubleWide final : cpp::array<T, 2> {
+ using UP = cpp::array<T, 2>;
+ using UP::UP;
+ LIBC_INLINE constexpr DoubleWide(T lo, T hi) : UP({lo, hi}) {}
+};
+
+// Converts an unsigned value into a DoubleWide<half_width_t<T>>.
+template <typename T> LIBC_INLINE constexpr auto split(T value) {
+ static_assert(cpp::is_unsigned_v<T>);
+ using half_type = half_width_t<T>;
+ return DoubleWide<half_type>(
+ half_type(value),
+ half_type(value >> cpp::numeric_limits<half_type>::digits));
+}
+
+// The low part of a DoubleWide value.
+template <typename T> LIBC_INLINE constexpr T lo(const DoubleWide<T> &value) {
+ return value[0];
+}
+// The high part of a DoubleWide value.
+template <typename T> LIBC_INLINE constexpr T hi(const DoubleWide<T> &value) {
+ return value[1];
+}
+// The low part of an unsigned value.
+template <typename T> LIBC_INLINE constexpr half_width_t<T> lo(T value) {
+ return lo(split(value));
+}
+// The high part of an unsigned value.
+template <typename T> LIBC_INLINE constexpr half_width_t<T> hi(T value) {
+ return hi(split(value));
+}
+
+// Returns 'a' times 'b' in a DoubleWide<word>. Cannot overflow by construction.
+template <typename word>
+LIBC_INLINE constexpr DoubleWide<word> mul2(word a, word b) {
+ if constexpr (cpp::is_same_v<word, uint8_t>) {
+ return split<uint16_t>(uint16_t(a) * uint16_t(b));
+ } else if constexpr (cpp::is_same_v<word, uint16_t>) {
+ return split<uint32_t>(uint32_t(a) * uint32_t(b));
+ }
+#ifdef LIBC_TYPES_HAS_INT64
+ else if constexpr (cpp::is_same_v<word, uint32_t>) {
+ return split<uint64_t>(uint64_t(a) * uint64_t(b));
+ }
+#endif
+#ifdef LIBC_TYPES_HAS_INT128
+ else if constexpr (cpp::is_same_v<word, uint64_t>) {
+ return split<__uint128_t>(__uint128_t(a) * __uint128_t(b));
+ }
+#endif
+ else {
+ using half_word = half_width_t<word>;
+ const auto shiftl = [](word value) -> word {
+ return value << cpp::numeric_limits<half_word>::digits;
+ };
+ const auto shiftr = [](word value) -> word {
+ return value >> cpp::numeric_limits<half_word>::digits;
+ };
+ // Here we do a one digit multiplication where 'a' and 'b' are of type
+ // word. We split 'a' and 'b' into half words and perform the classic long
+ // multiplication with 'a' and 'b' being two-digit numbers.
+
+ // a a_hi a_lo
+ // x b => x b_hi b_lo
+ // ---- -----------
+ // c result
+ // We convert 'lo' and 'hi' from 'half_word' to 'word' so multiplication
+ // doesn't overflow.
+ const word a_lo = lo(a);
+ const word b_lo = lo(b);
+ const word a_hi = hi(a);
+ const word b_hi = hi(b);
+ const word step1 = b_lo * a_lo; // no overflow;
+ const word step2 = b_lo * a_hi; // no overflow;
+ const word step3 = b_hi * a_lo; // no overflow;
+ const word step4 = b_hi * a_hi; // no overflow;
+ word lo_digit = step1;
+ word hi_digit = step4;
+ const word no_carry = 0;
+ word carry;
+ word _; // unused carry variable.
+ lo_digit = add_with_carry<word>(lo_digit, shiftl(step2), no_carry, carry);
+ hi_digit = add_with_carry<word>(hi_digit, shiftr(step2), carry, _);
+ lo_digit = add_with_carry<word>(lo_digit, shiftl(step3), no_carry, carry);
+ hi_digit = add_with_carry<word>(hi_digit, shiftr(step3), carry, _);
+ return DoubleWide<word>(lo_digit, hi_digit);
+ }
+}
+
+// In-place 'dst op= rhs' with operation with carry propagation. Returns carry.
+template <typename Function, typename word, size_t N, size_t M>
+LIBC_INLINE constexpr word inplace_binop(Function op_with_carry,
+ cpp::array<word, N> &dst,
+ const cpp::array<word, M> &rhs) {
+ static_assert(N >= M);
+ word carry_out = 0;
+ for (size_t i = 0; i < N; ++i) {
+ const bool has_rhs_value = i < M;
+ const word rhs_value = has_rhs_value ? rhs[i] : 0;
+ const word carry_in = carry_out;
+ dst[i] = op_with_carry(dst[i], rhs_value, carry_in, carry_out);
+ // stop early when rhs is over and no carry is to be propagated.
+ if (!has_rhs_value && carry_out == 0)
+ break;
+ }
+ return carry_out;
+}
- prod.lo = pa.lo * pb.lo; // exact
- prod.hi = pa.hi * pb.hi; // exact
- NumberPair<T> lo_hi = split(pa.lo * pb.hi); // exact
- NumberPair<T> hi_lo = split(pa.hi * pb.lo); // exact
+// In-place addition. Returns carry.
+template <typename word, size_t N, size_t M>
+LIBC_INLINE constexpr word add_with_carry(cpp::array<word, N> &dst,
+ const cpp::array<word, M> &rhs) {
+ return inplace_binop(LIBC_NAMESPACE::add_with_carry<word>, dst, rhs);
+}
+
+// In-place subtraction. Returns borrow.
+template <typename word, size_t N, size_t M>
+LIBC_INLINE constexpr word sub_with_borrow(cpp::array<word, N> &dst,
+ const cpp::array<word, M> &rhs) {
+ return inplace_binop(LIBC_NAMESPACE::sub_with_borrow<word>, dst, rhs);
+}
+
+// In-place multiply-add. Returns carry.
+// i.e., 'dst += b * c'
+template <typename word, size_t N>
+LIBC_INLINE constexpr word mul_add_with_carry(cpp::array<word, N> &dst, word b,
+ word c) {
+ return add_with_carry(dst, mul2(b, c));
+}
- constexpr size_t HALF_BIT_WIDTH = sizeof(T) * CHAR_BIT / 2;
+// An array of two elements serving as an accumulator during multiword
+// computations.
+template <typename T> struct Accumulator final : cpp::array<T, 2> {
+ using UP = cpp::array<T, 2>;
+ LIBC_INLINE constexpr Accumulator() : UP({0, 0}) {}
+ LIBC_INLINE constexpr T advance(T carry_in) {
+ auto result = UP::front();
+ UP::front() = UP::back();
+ UP::back() = carry_in;
+ return result;
+ }
+ LIBC_INLINE constexpr T sum() const { return UP::front(); }
+ LIBC_INLINE constexpr T carry() const { return UP::back(); }
+};
- auto r1 = add_with_carry(prod.lo, lo_hi.lo << HALF_BIT_WIDTH, T(0));
- prod.lo = r1.sum;
- prod.hi = add_with_carry(prod.hi, lo_hi.hi, r1.carry).sum;
+// In-place multiplication by a single word. Returns carry.
+template <typename word, size_t N>
+LIBC_INLINE constexpr word scalar_multiply_with_carry(cpp::array<word, N> &dst,
+ word x) {
+ Accumulator<word> acc;
+ for (auto &val : dst) {
+ const word carry = mul_add_with_carry(acc, val, x);
+ val = acc.advance(carry);
+ }
+ return acc.carry();
+}
- auto r2 = add_with_carry(prod.lo, hi_lo.lo << HALF_BIT_WIDTH, T(0));
- prod.lo = r2.sum;
- prod.hi = add_with_carry(prod.hi, hi_lo.hi, r2.carry).sum;
+// Multiplication of 'lhs' by 'rhs' into 'dst'. Returns carry.
+// This function is safe to use for signed numbers.
+// https://stackoverflow.com/a/20793834
+// https://pages.cs.wisc.edu/%7Emarkhill/cs354/Fall2008/beyond354/int.mult.html
+template <typename word, size_t O, size_t M, size_t N>
+LIBC_INLINE constexpr word multiply_with_carry(cpp::array<word, O> &dst,
+ const cpp::array<word, M> &lhs,
+ const cpp::array<word, N> &rhs) {
+ static_assert(O >= M + N);
+ Accumulator<word> acc;
+ for (size_t i = 0; i < O; ++i) {
+ const size_t lower_idx = i < N ? 0 : i - N + 1;
+ const size_t upper_idx = i < M ? i : M - 1;
+ word carry = 0;
+ for (size_t j = lower_idx; j <= upper_idx; ++j)
+ carry += mul_add_with_carry(acc, lhs[j], rhs[i - j]);
+ dst[i] = acc.advance(carry);
+ }
+ return acc.carry();
+}
- return prod;
+template <typename word, size_t N>
+LIBC_INLINE constexpr void quick_mul_hi(cpp::array<word, N> &dst,
+ const cpp::array<word, N> &lhs,
+ const cpp::array<word, N> &rhs) {
+ Accumulator<word> acc;
+ word carry = 0;
+ // First round of accumulation for those at N - 1 in the full product.
+ for (size_t i = 0; i < N; ++i)
+ carry += mul_add_with_carry(acc, lhs[i], rhs[N - 1 - i]);
+ for (size_t i = N; i < 2 * N - 1; ++i) {
+ acc.advance(carry);
+ carry = 0;
+ for (size_t j = i - N + 1; j < N; ++j)
+ carry += mul_add_with_carry(acc, lhs[j], rhs[i - j]);
+ dst[i - N] = acc.sum();
+ }
+ dst.back() = acc.carry();
}
-template <>
-LIBC_INLINE constexpr NumberPair<uint32_t> full_mul<uint32_t>(uint32_t a,
- uint32_t b) {
- uint64_t prod = uint64_t(a) * uint64_t(b);
- NumberPair<uint32_t> result;
- result.lo = uint32_t(prod);
- result.hi = uint32_t(prod >> 32);
- return result;
+template <typename word, size_t N>
+LIBC_INLINE constexpr bool is_negative(cpp::array<word, N> &array) {
+ using signed_word = cpp::make_signed_t<word>;
+ return cpp::bit_cast<signed_word>(array.back()) < 0;
}
+// An enum for the shift function below.
+enum Direction { LEFT, RIGHT };
+
+// A bitwise shift on an array of elements.
+// TODO: Make the result UB when 'offset' is greater or equal to the number of
+// bits in 'array'. This will allow for better code performance.
+template <Direction direction, bool is_signed, typename word, size_t N>
+LIBC_INLINE constexpr cpp::array<word, N> shift(cpp::array<word, N> array,
+ size_t offset) {
+ static_assert(direction == LEFT || direction == RIGHT);
+ constexpr size_t WORD_BITS = cpp::numeric_limits<word>::digits;
+ constexpr size_t TOTAL_BITS = N * WORD_BITS;
+ if (LIBC_UNLIKELY(offset == 0))
+ return array;
+ if (LIBC_UNLIKELY(offset >= TOTAL_BITS))
+ return {};
#ifdef LIBC_TYPES_HAS_INT128
-template <>
-LIBC_INLINE constexpr NumberPair<uint64_t> full_mul<uint64_t>(uint64_t a,
- uint64_t b) {
- __uint128_t prod = __uint128_t(a) * __uint128_t(b);
- NumberPair<uint64_t> result;
- result.lo = uint64_t(prod);
- result.hi = uint64_t(prod >> 64);
- return result;
+ if constexpr (TOTAL_BITS == 128) {
+ using type = cpp::conditional_t<is_signed, __int128_t, __uint128_t>;
+ auto tmp = cpp::bit_cast<type>(array);
+ if constexpr (direction == LEFT)
+ tmp <<= offset;
+ else
+ tmp >>= offset;
+ return cpp::bit_cast<cpp::array<word, N>>(tmp);
+ }
+#endif
+ const bool is_neg = is_signed && is_negative(array);
+ constexpr auto at = [](size_t index) -> int {
+ // reverse iteration when direction == LEFT.
+ if constexpr (direction == LEFT)
+ return int(N) - int(index) - 1;
+ return int(index);
+ };
+ const auto safe_get_at = [&](size_t index) -> word {
+ // return appropriate value when accessing out of bound elements.
+ const int i = at(index);
+ if (i < 0)
+ return 0;
+ if (i >= int(N))
+ return is_neg ? -1 : 0;
+ return array[i];
+ };
+ const size_t index_offset = offset / WORD_BITS;
+ const size_t bit_offset = offset % WORD_BITS;
+#ifdef LIBC_COMPILER_IS_CLANG
+ __builtin_assume(index_offset < N);
+#endif
+ cpp::array<word, N> out = {};
+ for (size_t index = 0; index < N; ++index) {
+ const word part1 = safe_get_at(index + index_offset);
+ const word part2 = safe_get_at(index + index_offset + 1);
+ word &dst = out[at(index)];
+ if (bit_offset == 0)
+ dst = part1; // no crosstalk between parts.
+ else if constexpr (direction == LEFT)
+ dst = (part1 << bit_offset) | (part2 >> (WORD_BITS - bit_offset));
+ else
+ dst = (part1 >> bit_offset) | (part2 << (WORD_BITS - bit_offset));
+ }
+ return out;
}
-#endif // LIBC_TYPES_HAS_INT128
-} // namespace internal
+#define DECLARE_COUNTBIT(NAME, INDEX_EXPR) \
+ template <typename word, size_t N> \
+ LIBC_INLINE constexpr int NAME(const cpp::array<word, N> &val) { \
+ int bit_count = 0; \
+ for (size_t i = 0; i < N; ++i) { \
+ const int word_count = cpp::NAME<word>(val[INDEX_EXPR]); \
+ bit_count += word_count; \
+ if (word_count != cpp::numeric_limits<word>::digits) \
+ break; \
+ } \
+ return bit_count; \
+ }
+
+DECLARE_COUNTBIT(countr_zero, i) // iterating forward
+DECLARE_COUNTBIT(countr_one, i) // iterating forward
+DECLARE_COUNTBIT(countl_zero, N - i - 1) // iterating backward
+DECLARE_COUNTBIT(countl_one, N - i - 1) // iterating backward
+
+} // namespace multiword
template <size_t Bits, bool Signed, typename WordType = uint64_t>
struct BigInt {
+private:
static_assert(cpp::is_integral_v<WordType> && cpp::is_unsigned_v<WordType>,
"WordType must be unsigned integer.");
+ struct Division {
+ BigInt quotient;
+ BigInt remainder;
+ };
+
+public:
using word_type = WordType;
+ using unsigned_type = BigInt<Bits, false, word_type>;
+ using signed_type = BigInt<Bits, true, word_type>;
+
LIBC_INLINE_VAR static constexpr bool SIGNED = Signed;
LIBC_INLINE_VAR static constexpr size_t BITS = Bits;
LIBC_INLINE_VAR
@@ -100,10 +354,7 @@ struct BigInt {
LIBC_INLINE_VAR static constexpr size_t WORD_COUNT = Bits / WORD_SIZE;
- using unsigned_type = BigInt<BITS, false, word_type>;
- using signed_type = BigInt<BITS, true, word_type>;
-
- cpp::array<WordType, WORD_COUNT> val{};
+ cpp::array<WordType, WORD_COUNT> val{}; // zero initialized.
LIBC_INLINE constexpr BigInt() = default;
@@ -112,76 +363,67 @@ struct BigInt {
template <size_t OtherBits, bool OtherSigned>
LIBC_INLINE constexpr BigInt(
const BigInt<OtherBits, OtherSigned, WordType> &other) {
- if (OtherBits >= Bits) {
+ if (OtherBits >= Bits) { // truncate
for (size_t i = 0; i < WORD_COUNT; ++i)
val[i] = other[i];
- } else {
+ } else { // zero or sign extend
size_t i = 0;
for (; i < OtherBits / WORD_SIZE; ++i)
val[i] = other[i];
- WordType sign = 0;
- if constexpr (Signed && OtherSigned) {
- sign = static_cast<WordType>(
- -static_cast<cpp::make_signed_t<WordType>>(other.is_neg()));
- }
- for (; i < WORD_COUNT; ++i)
- val[i] = sign;
+ extend(i, Signed && other.is_neg());
}
}
// Construct a BigInt from a C array.
- template <size_t N, cpp::enable_if_t<N <= WORD_COUNT, int> = 0>
- LIBC_INLINE constexpr BigInt(const WordType (&nums)[N]) {
- size_t min_wordcount = N < WORD_COUNT ? N : WORD_COUNT;
- size_t i = 0;
- for (; i < min_wordcount; ++i)
+ template <size_t N> LIBC_INLINE constexpr BigInt(const WordType (&nums)[N]) {
+ static_assert(N == WORD_COUNT);
+ for (size_t i = 0; i < WORD_COUNT; ++i)
val[i] = nums[i];
+ }
- // If nums doesn't completely fill val, then fill the rest with zeroes.
- for (; i < WORD_COUNT; ++i)
- val[i] = 0;
+ LIBC_INLINE constexpr explicit BigInt(
+ const cpp::array<WordType, WORD_COUNT> &words) {
+ val = words;
}
// Initialize the first word to |v| and the rest to 0.
template <typename T, typename = cpp::enable_if_t<cpp::is_integral_v<T>>>
LIBC_INLINE constexpr BigInt(T v) {
- val[0] = static_cast<WordType>(v);
-
- if constexpr (WORD_COUNT == 1)
- return;
-
- if constexpr (Bits < sizeof(T) * CHAR_BIT) {
- for (int i = 1; i < WORD_COUNT; ++i) {
- v >>= WORD_SIZE;
- val[i] = static_cast<WordType>(v);
+ constexpr size_t T_SIZE = sizeof(T) * CHAR_BIT;
+ const bool is_neg = Signed && (v < 0);
+ for (size_t i = 0; i < WORD_COUNT; ++i) {
+ if (v == 0) {
+ extend(i, is_neg);
+ return;
}
- return;
- }
-
- size_t i = 1;
-
- if constexpr (WORD_SIZE < sizeof(T) * CHAR_BIT)
- for (; i < sizeof(T) * CHAR_BIT / WORD_SIZE; ++i) {
+ val[i] = static_cast<WordType>(v);
+ if constexpr (T_SIZE > WORD_SIZE)
v >>= WORD_SIZE;
- val[i] = static_cast<WordType>(v);
- }
-
- WordType sign = (Signed && (v < 0)) ? ~WordType(0) : WordType(0);
- for (; i < WORD_COUNT; ++i) {
- val[i] = sign;
+ else
+ v = 0;
}
}
+ LIBC_INLINE constexpr BigInt &operator=(const BigInt &other) = default;
- LIBC_INLINE constexpr explicit BigInt(
- const cpp::array<WordType, WORD_COUNT> &words) {
- for (size_t i = 0; i < WORD_COUNT; ++i)
- val[i] = words[i];
+ // constants
+ LIBC_INLINE static constexpr BigInt zero() { return BigInt(); }
+ LIBC_INLINE static constexpr BigInt one() { return BigInt(1); }
+ LIBC_INLINE static constexpr BigInt all_ones() { return ~zero(); }
+ LIBC_INLINE static constexpr BigInt min() {
+ BigInt out;
+ if constexpr (SIGNED)
+ out.set_msb();
+ return out;
+ }
+ LIBC_INLINE static constexpr BigInt max() {
+ BigInt out = all_ones();
+ if constexpr (SIGNED)
+ out.clear_msb();
+ return out;
}
// TODO: Reuse the Sign type.
- LIBC_INLINE constexpr bool is_neg() const {
- return val.back() >> (WORD_SIZE - 1);
- }
+ LIBC_INLINE constexpr bool is_neg() const { return SIGNED && get_msb(); }
template <typename T> LIBC_INLINE constexpr explicit operator T() const {
return to<T>();
@@ -191,200 +433,100 @@ struct BigInt {
LIBC_INLINE constexpr cpp::enable_if_t<
cpp::is_integral_v<T> && !cpp::is_same_v<T, bool>, T>
to() const {
+ constexpr size_t T_SIZE = sizeof(T) * CHAR_BIT;
T lo = static_cast<T>(val[0]);
-
- constexpr size_t T_BITS = sizeof(T) * CHAR_BIT;
-
- if constexpr (T_BITS <= WORD_SIZE)
+ if constexpr (T_SIZE <= WORD_SIZE)
return lo;
-
constexpr size_t MAX_COUNT =
- T_BITS > Bits ? WORD_COUNT : T_BITS / WORD_SIZE;
+ T_SIZE > Bits ? WORD_COUNT : T_SIZE / WORD_SIZE;
for (size_t i = 1; i < MAX_COUNT; ++i)
lo += static_cast<T>(val[i]) << (WORD_SIZE * i);
-
- if constexpr (Signed && (T_BITS > Bits)) {
+ if constexpr (Signed && (T_SIZE > Bits)) {
// Extend sign for negative numbers.
constexpr T MASK = (~T(0) << Bits);
if (is_neg())
lo |= MASK;
}
-
return lo;
}
LIBC_INLINE constexpr explicit operator bool() const { return !is_zero(); }
- LIBC_INLINE constexpr BigInt &operator=(const BigInt &other) = default;
-
LIBC_INLINE constexpr bool is_zero() const {
- for (size_t i = 0; i < WORD_COUNT; ++i) {
- if (val[i] != 0)
+ for (auto part : val)
+ if (part != 0)
return false;
- }
return true;
}
- // Add x to this number and store the result in this number.
+ // Add 'rhs' to this number and store the result in this number.
// Returns the carry value produced by the addition operation.
- LIBC_INLINE constexpr WordType add(const BigInt &x) {
- SumCarry<WordType> s{0, 0};
- for (size_t i = 0; i < WORD_COUNT; ++i) {
- s = add_with_carry(val[i], x.val[i], s.carry);
- val[i] = s.sum;
- }
- return s.carry;
+ LIBC_INLINE constexpr WordType add_overflow(const BigInt &rhs) {
+ return multiword::add_with_carry(val, rhs.val);
}
LIBC_INLINE constexpr BigInt operator+(const BigInt &other) const {
- BigInt result;
- SumCarry<WordType> s{0, 0};
- for (size_t i = 0; i < WORD_COUNT; ++i) {
- s = add_with_carry(val[i], other.val[i], s.carry);
- result.val[i] = s.sum;
- }
+ BigInt result = *this;
+ result.add_overflow(other);
return result;
}
// This will only apply when initializing a variable from constant values, so
// it will always use the constexpr version of add_with_carry.
LIBC_INLINE constexpr BigInt operator+(BigInt &&other) const {
- BigInt result;
- SumCarry<WordType> s{0, 0};
- for (size_t i = 0; i < WORD_COUNT; ++i) {
- s = add_with_carry(val[i], other.val[i], s.carry);
- result.val[i] = s.sum;
- }
- return result;
+ // We use addition commutativity to reuse 'other' and prevent allocation.
+ other.add_overflow(*this); // Returned carry value is ignored.
+ return other;
}
LIBC_INLINE constexpr BigInt &operator+=(const BigInt &other) {
- add(other); // Returned carry value is ignored.
+ add_overflow(other); // Returned carry value is ignored.
return *this;
}
- // Subtract x to this number and store the result in this number.
+ // Subtract 'rhs' to this number and store the result in this number.
// Returns the carry value produced by the subtraction operation.
- LIBC_INLINE constexpr WordType sub(const BigInt &x) {
- DiffBorrow<WordType> d{0, 0};
- for (size_t i = 0; i < WORD_COUNT; ++i) {
- d = sub_with_borrow(val[i], x.val[i], d.borrow);
- val[i] = d.diff;
- }
- return d.borrow;
+ LIBC_INLINE constexpr WordType sub_overflow(const BigInt &rhs) {
+ return multiword::sub_with_borrow(val, rhs.val);
}
LIBC_INLINE constexpr BigInt operator-(const BigInt &other) const {
- BigInt result;
- DiffBorrow<WordType> d{0, 0};
- for (size_t i = 0; i < WORD_COUNT; ++i) {
- d = sub_with_borrow(val[i], other.val[i], d.borrow);
- result.val[i] = d.diff;
- }
+ BigInt result = *this;
+ result.sub_overflow(other); // Returned carry value is ignored.
return result;
}
LIBC_INLINE constexpr BigInt operator-(BigInt &&other) const {
- BigInt result;
- DiffBorrow<WordType> d{0, 0};
- for (size_t i = 0; i < WORD_COUNT; ++i) {
- d = sub_with_borrow(val[i], other.val[i], d.borrow);
- result.val[i] = d.diff;
- }
+ BigInt result = *this;
+ result.sub_overflow(other); // Returned carry value is ignored.
return result;
}
LIBC_INLINE constexpr BigInt &operator-=(const BigInt &other) {
// TODO(lntue): Set overflow flag / errno when carry is true.
- sub(other);
+ sub_overflow(other); // Returned carry value is ignored.
return *this;
}
- // Multiply this number with x and store the result in this number. It is
- // implemented using the long multiplication algorithm by splitting the
- // 64-bit words of this number and |x| in to 32-bit halves but peforming
- // the operations using 64-bit numbers. This ensures that we don't lose the
- // carry bits.
- // Returns the carry value produced by the multiplication operation.
+ // Multiply this number with x and store the result in this number.
LIBC_INLINE constexpr WordType mul(WordType x) {
- BigInt<2 * WORD_SIZE, Signed, WordType> partial_sum(0);
- for (size_t i = 0; i < WORD_COUNT; ++i) {
- NumberPair<WordType> prod = internal::full_mul(val[i], x);
- BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi});
- const WordType carry = partial_sum.add(tmp);
- val[i] = partial_sum.val[0];
- partial_sum.val[0] = partial_sum.val[1];
- partial_sum.val[1] = carry;
- }
- return partial_sum.val[1];
- }
-
- LIBC_INLINE constexpr BigInt operator*(const BigInt &other) const {
- if constexpr (Signed) {
- BigInt<Bits, false, WordType> a(*this);
- BigInt<Bits, false, WordType> b(other);
- const bool a_neg = a.is_neg();
- const bool b_neg = b.is_neg();
- if (a_neg)
- a = -a;
- if (b_neg)
- b = -b;
- BigInt<Bits, false, WordType> prod = a * b;
- if (a_neg != b_neg)
- prod = -prod;
- return static_cast<BigInt<Bits, true, WordType>>(prod);
- } else {
- if constexpr (WORD_COUNT == 1) {
- return {val[0] * other.val[0]};
- } else {
- BigInt result(0);
- BigInt<2 * WORD_SIZE, Signed, WordType> partial_sum(0);
- WordType carry = 0;
- for (size_t i = 0; i < WORD_COUNT; ++i) {
- for (size_t j = 0; j <= i; j++) {
- NumberPair<WordType> prod =
- internal::full_mul(val[j], other.val[i - j]);
- BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi});
- carry += partial_sum.add(tmp);
- }
- result.val[i] = partial_sum.val[0];
- partial_sum.val[0] = partial_sum.val[1];
- partial_sum.val[1] = carry;
- carry = 0;
- }
- return result;
- }
- }
+ return multiword::scalar_multiply_with_carry(val, x);
}
- // Return the full product, only unsigned for now.
+ // Return the full product.
template <size_t OtherBits>
- LIBC_INLINE constexpr BigInt<Bits + OtherBits, Signed, WordType>
+ LIBC_INLINE constexpr auto
ful_mul(const BigInt<OtherBits, Signed, WordType> &other) const {
- BigInt<Bits + OtherBits, Signed, WordType> result(0);
- BigInt<2 * WORD_SIZE, Signed, WordType> partial_sum(0);
- WordType carry = 0;
- constexpr size_t OTHER_WORDCOUNT =
- BigInt<OtherBits, Signed, WordType>::WORD_COUNT;
- for (size_t i = 0; i <= WORD_COUNT + OTHER_WORDCOUNT - 2; ++i) {
- const size_t lower_idx =
- i < OTHER_WORDCOUNT ? 0 : i - OTHER_WORDCOUNT + 1;
- const size_t upper_idx = i < WORD_COUNT ? i : WORD_COUNT - 1;
- for (size_t j = lower_idx; j <= upper_idx; ++j) {
- NumberPair<WordType> prod =
- internal::full_mul(val[j], other.val[i - j]);
- BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi});
- carry += partial_sum.add(tmp);
- }
- result.val[i] = partial_sum.val[0];
- partial_sum.val[0] = partial_sum.val[1];
- partial_sum.val[1] = carry;
- carry = 0;
- }
- result.val[WORD_COUNT + OTHER_WORDCOUNT - 1] = partial_sum.val[0];
+ BigInt<Bits + OtherBits, Signed, WordType> result;
+ multiword::multiply_with_carry(result.val, val, other.val);
return result;
}
+ LIBC_INLINE constexpr BigInt operator*(const BigInt &other) const {
+ // Perform full mul and truncate.
+ return BigInt(ful_mul(other));
+ }
+
// Fast hi part of the full product. The normal product `operator*` returns
// `Bits` least significant bits of the full product, while this function will
// approximate `Bits` most significant bits of the full product with errors
@@ -407,39 +549,17 @@ struct BigInt {
// 256 4 16 10 3
// 512 8 64 36 7
LIBC_INLINE constexpr BigInt quick_mul_hi(const BigInt &other) const {
- BigInt result(0);
- BigInt<2 * WORD_SIZE, Signed, WordType> partial_sum(0);
- WordType carry = 0;
- // First round of accumulation for those at WORD_COUNT - 1 in the full
- // product.
- for (size_t i = 0; i < WORD_COUNT; ++i) {
- NumberPair<WordType> prod =
- internal::full_mul(val[i], other.val[WORD_COUNT - 1 - i]);
- BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi});
- carry += partial_sum.add(tmp);
- }
- for (size_t i = WORD_COUNT; i < 2 * WORD_COUNT - 1; ++i) {
- partial_sum.val[0] = partial_sum.val[1];
- partial_sum.val[1] = carry;
- carry = 0;
- for (size_t j = i - WORD_COUNT + 1; j < WORD_COUNT; ++j) {
- NumberPair<WordType> prod =
- internal::full_mul(val[j], other.val[i - j]);
- BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi});
- carry += partial_sum.add(tmp);
- }
- result.val[i - WORD_COUNT] = partial_sum.val[0];
- }
- result.val[WORD_COUNT - 1] = partial_sum.val[1];
+ BigInt result;
+ multiword::quick_mul_hi(result.val, val, other.val);
return result;
}
- // pow takes a power and sets this to its starting value to that power. Zero
- // to the zeroth power returns 1.
+ // BigInt(x).pow_n(n) computes x ^ n.
+ // Note 0 ^ 0 == 1.
LIBC_INLINE constexpr void pow_n(uint64_t power) {
- BigInt result = 1;
+ static_assert(!Signed);
+ BigInt result = one();
BigInt cur_power = *this;
-
while (power > 0) {
if ((power % 2) > 0)
result *= cur_power;
@@ -449,38 +569,23 @@ struct BigInt {
*this = result;
}
- // TODO: Make division work correctly for signed integers.
-
- // div takes another BigInt of the same size and divides this by it. The value
- // of this will be set to the quotient, and the return value is the remainder.
- LIBC_INLINE constexpr cpp::optional<BigInt> div(const BigInt &other) {
- BigInt remainder(0);
- if (*this < other) {
- remainder = *this;
- *this = BigInt(0);
- return remainder;
- }
- if (other == 1) {
- return remainder;
- }
- if (other == 0) {
+ // Performs inplace signed / unsigned division. Returns remainder if not
+ // dividing by zero.
+ // For signed numbers it behaves like C++ signed integer division.
+  // That is by truncating the fractional part
+ // https://stackoverflow.com/a/3602857
+ LIBC_INLINE constexpr cpp::optional<BigInt> div(const BigInt &divider) {
+ if (LIBC_UNLIKELY(divider.is_zero()))
return cpp::nullopt;
- }
-
- BigInt quotient(0);
- BigInt subtractor = other;
- int cur_bit = static_cast<int>(subtractor.clz() - this->clz());
- subtractor.shift_left(cur_bit);
-
- for (; cur_bit >= 0 && *this > 0; --cur_bit, subtractor.shift_right(1)) {
- if (*this >= subtractor) {
- this->sub(subtractor);
- quotient = quotient | (BigInt(1) << cur_bit);
- }
- }
- remainder = *this;
- *this = quotient;
- return remainder;
+ if (LIBC_UNLIKELY(divider == BigInt::one()))
+ return BigInt::zero();
+ Division result;
+ if constexpr (SIGNED)
+ result = divide_signed(*this, divider);
+ else
+ result = divide_unsigned(*this, divider);
+ *this = result.quotient;
+ return result.remainder;
}
// Efficiently perform BigInt / (x * 2^e), where x is a half-word-size
@@ -496,19 +601,16 @@ struct BigInt {
// computation of each step is now properly contained within WordType.
// And finally we perform some extra alignment steps for the remaining bits.
LIBC_INLINE constexpr cpp::optional<BigInt>
- div_uint_half_times_pow_2(internal::half_width_t<WordType> x, size_t e) {
- BigInt remainder(0);
-
- if (x == 0) {
+ div_uint_half_times_pow_2(multiword::half_width_t<WordType> x, size_t e) {
+ BigInt remainder;
+ if (x == 0)
return cpp::nullopt;
- }
if (e >= Bits) {
remainder = *this;
- *this = BigInt<Bits, false, WordType>(0);
+ *this = BigInt<Bits, false, WordType>();
return remainder;
}
-
- BigInt quotient(0);
+ BigInt quotient;
WordType x_word = static_cast<WordType>(x);
constexpr size_t LOG2_WORD_SIZE = cpp::bit_width(WORD_SIZE) - 1;
constexpr size_t HALF_WORD_SIZE = WORD_SIZE >> 1;
@@ -633,189 +735,22 @@ struct BigInt {
return *this;
}
- // TODO: remove and use cpp::countl_zero below.
- [[nodiscard]] LIBC_INLINE constexpr int clz() const {
- constexpr int word_digits = cpp::numeric_limits<word_type>::digits;
- int leading_zeroes = 0;
- for (auto i = val.size(); i > 0;) {
- --i;
- const int zeroes = cpp::countl_zero(val[i]);
- leading_zeroes += zeroes;
- if (zeroes != word_digits)
- break;
- }
- return leading_zeroes;
- }
-
- // TODO: remove and use cpp::countr_zero below.
- [[nodiscard]] LIBC_INLINE constexpr int ctz() const {
- constexpr int word_digits = cpp::numeric_limits<word_type>::digits;
- int trailing_zeroes = 0;
- for (auto word : val) {
- const int zeroes = cpp::countr_zero(word);
- trailing_zeroes += zeroes;
- if (zeroes != word_digits)
- break;
- }
- return trailing_zeroes;
- }
-
- LIBC_INLINE constexpr void shift_left(size_t s) {
- if constexpr (Bits == WORD_SIZE) {
- // Use native types if possible.
- if (s >= WORD_SIZE) {
- val[0] = 0;
- return;
- }
- val[0] <<= s;
- return;
- }
- if constexpr ((Bits == 64) && (WORD_SIZE == 32)) {
- // Use builtin 64 bits for 32-bit base type if available;
- if (s >= 64) {
- val[0] = 0;
- val[1] = 0;
- return;
- }
- uint64_t tmp = uint64_t(val[0]) + (uint64_t(val[1]) << 32);
- tmp <<= s;
- val[0] = uint32_t(tmp);
- val[1] = uint32_t(tmp >> 32);
- return;
- }
-#ifdef LIBC_TYPES_HAS_INT128
- if constexpr ((Bits == 128) && (WORD_SIZE == 64)) {
- // Use builtin 128 bits if available;
- if (s >= 128) {
- val[0] = 0;
- val[1] = 0;
- return;
- }
- __uint128_t tmp = __uint128_t(val[0]) + (__uint128_t(val[1]) << 64);
- tmp <<= s;
- val[0] = uint64_t(tmp);
- val[1] = uint64_t(tmp >> 64);
- return;
- }
-#endif // LIBC_TYPES_HAS_INT128
- if (LIBC_UNLIKELY(s == 0))
- return;
-
- const size_t drop = s / WORD_SIZE; // Number of words to drop
- const size_t shift = s % WORD_SIZE; // Bits to shift in the remaining words.
- size_t i = WORD_COUNT;
-
- if (drop < WORD_COUNT) {
- i = WORD_COUNT - 1;
- if (shift > 0) {
- for (size_t j = WORD_COUNT - 1 - drop; j > 0; --i, --j) {
- val[i] = (val[j] << shift) | (val[j - 1] >> (WORD_SIZE - shift));
- }
- val[i] = val[0] << shift;
- } else {
- for (size_t j = WORD_COUNT - 1 - drop; j > 0; --i, --j) {
- val[i] = val[j];
- }
- val[i] = val[0];
- }
- }
-
- for (size_t j = 0; j < i; ++j) {
- val[j] = 0;
- }
+ LIBC_INLINE constexpr BigInt &operator<<=(size_t s) {
+ val = multiword::shift<multiword::LEFT, SIGNED>(val, s);
+ return *this;
}
LIBC_INLINE constexpr BigInt operator<<(size_t s) const {
- BigInt result(*this);
- result.shift_left(s);
- return result;
+ return BigInt(multiword::shift<multiword::LEFT, SIGNED>(val, s));
}
- LIBC_INLINE constexpr BigInt &operator<<=(size_t s) {
- shift_left(s);
+ LIBC_INLINE constexpr BigInt &operator>>=(size_t s) {
+ val = multiword::shift<multiword::RIGHT, SIGNED>(val, s);
return *this;
}
- LIBC_INLINE constexpr void shift_right(size_t s) {
- if constexpr ((Bits == 64) && (WORD_SIZE == 32)) {
- // Use builtin 64 bits if available;
- if (s >= 64) {
- val[0] = 0;
- val[1] = 0;
- return;
- }
- uint64_t tmp = uint64_t(val[0]) + (uint64_t(val[1]) << 32);
- if constexpr (Signed) {
- tmp = static_cast<uint64_t>(static_cast<int64_t>(tmp) >> s);
- } else {
- tmp >>= s;
- }
- val[0] = uint32_t(tmp);
- val[1] = uint32_t(tmp >> 32);
- return;
- }
-#ifdef LIBC_TYPES_HAS_INT128
- if constexpr ((Bits == 128) && (WORD_SIZE == 64)) {
- // Use builtin 128 bits if available;
- if (s >= 128) {
- val[0] = 0;
- val[1] = 0;
- return;
- }
- __uint128_t tmp = __uint128_t(val[0]) + (__uint128_t(val[1]) << 64);
- if constexpr (Signed) {
- tmp = static_cast<__uint128_t>(static_cast<__int128_t>(tmp) >> s);
- } else {
- tmp >>= s;
- }
- val[0] = uint64_t(tmp);
- val[1] = uint64_t(tmp >> 64);
- return;
- }
-#endif // LIBC_TYPES_HAS_INT128
-
- if (LIBC_UNLIKELY(s == 0))
- return;
- const size_t drop = s / WORD_SIZE; // Number of words to drop
- const size_t shift = s % WORD_SIZE; // Bit shift in the remaining words.
-
- size_t i = 0;
- WordType sign = Signed ? is_neg() : 0;
-
- if (drop < WORD_COUNT) {
- if (shift > 0) {
- for (size_t j = drop; j < WORD_COUNT - 1; ++i, ++j) {
- val[i] = (val[j] >> shift) | (val[j + 1] << (WORD_SIZE - shift));
- }
- if constexpr (Signed) {
- val[i] = static_cast<WordType>(
- static_cast<cpp::make_signed_t<WordType>>(val[WORD_COUNT - 1]) >>
- shift);
- } else {
- val[i] = val[WORD_COUNT - 1] >> shift;
- }
- ++i;
- } else {
- for (size_t j = drop; j < WORD_COUNT; ++i, ++j) {
- val[i] = val[j];
- }
- }
- }
-
- for (; i < WORD_COUNT; ++i) {
- val[i] = sign;
- }
- }
-
LIBC_INLINE constexpr BigInt operator>>(size_t s) const {
- BigInt result(*this);
- result.shift_right(s);
- return result;
- }
-
- LIBC_INLINE constexpr BigInt &operator>>=(size_t s) {
- shift_right(s);
- return *this;
+ return BigInt(multiword::shift<multiword::RIGHT, SIGNED>(val, s));
}
#define DEFINE_BINOP(OP) \
@@ -833,10 +768,9 @@ struct BigInt {
return lhs; \
}
- DEFINE_BINOP(&)
- DEFINE_BINOP(|)
- DEFINE_BINOP(^)
-
+ DEFINE_BINOP(&) // & and &=
+ DEFINE_BINOP(|) // | and |=
+ DEFINE_BINOP(^) // ^ and ^=
#undef DEFINE_BINOP
LIBC_INLINE constexpr BigInt operator~() const {
@@ -847,8 +781,8 @@ struct BigInt {
}
LIBC_INLINE constexpr BigInt operator-() const {
- BigInt result = ~(*this);
- result.add(BigInt(1));
+ BigInt result(*this);
+ result.negate();
return result;
}
@@ -865,24 +799,6 @@ struct BigInt {
return !(lhs == rhs);
}
-private:
- LIBC_INLINE friend constexpr int cmp(const BigInt &lhs, const BigInt &rhs) {
- const auto compare = [](WordType a, WordType b) {
- return a == b ? 0 : a > b ? 1 : -1;
- };
- if constexpr (Signed) {
- const bool lhs_is_neg = lhs.is_neg();
- const bool rhs_is_neg = rhs.is_neg();
- if (lhs_is_neg != rhs_is_neg)
- return rhs_is_neg ? 1 : -1;
- }
- for (size_t i = WORD_COUNT; i-- > 0;)
- if (auto cmp = compare(lhs[i], rhs[i]); cmp != 0)
- return cmp;
- return 0;
- }
-
-public:
LIBC_INLINE friend constexpr bool operator>(const BigInt &lhs,
const BigInt &rhs) {
return cmp(lhs, rhs) > 0;
@@ -901,24 +817,24 @@ public:
}
LIBC_INLINE constexpr BigInt &operator++() {
- add(BigInt(1));
+ increment();
return *this;
}
LIBC_INLINE constexpr BigInt operator++(int) {
BigInt oldval(*this);
- add(BigInt(1));
+ increment();
return oldval;
}
LIBC_INLINE constexpr BigInt &operator--() {
- sub(BigInt(1));
+ decrement();
return *this;
}
LIBC_INLINE constexpr BigInt operator--(int) {
BigInt oldval(*this);
- sub(BigInt(1));
+ decrement();
return oldval;
}
@@ -930,9 +846,117 @@ public:
// Return the i-th word of the number.
LIBC_INLINE constexpr WordType &operator[](size_t i) { return val[i]; }
- LIBC_INLINE WordType *data() { return val; }
+private:
+ LIBC_INLINE friend constexpr int cmp(const BigInt &lhs, const BigInt &rhs) {
+ constexpr auto compare = [](WordType a, WordType b) {
+ return a == b ? 0 : a > b ? 1 : -1;
+ };
+ if constexpr (Signed) {
+ const bool lhs_is_neg = lhs.is_neg();
+ const bool rhs_is_neg = rhs.is_neg();
+ if (lhs_is_neg != rhs_is_neg)
+ return rhs_is_neg ? 1 : -1;
+ }
+ for (size_t i = WORD_COUNT; i-- > 0;)
+ if (auto cmp = compare(lhs[i], rhs[i]); cmp != 0)
+ return cmp;
+ return 0;
+ }
+
+ LIBC_INLINE constexpr void bitwise_not() {
+ for (auto &part : val)
+ part = ~part;
+ }
+
+ LIBC_INLINE constexpr void negate() {
+ bitwise_not();
+ increment();
+ }
- LIBC_INLINE const WordType *data() const { return val; }
+ LIBC_INLINE constexpr void increment() {
+ multiword::add_with_carry(val, cpp::array<WordType, 1>{1});
+ }
+
+  LIBC_INLINE constexpr void decrement() {
+    multiword::sub_with_borrow(val, cpp::array<WordType, 1>{1});
+  }
+
+ LIBC_INLINE constexpr void extend(size_t index, bool is_neg) {
+ const WordType value = is_neg ? cpp::numeric_limits<WordType>::max()
+ : cpp::numeric_limits<WordType>::min();
+ for (size_t i = index; i < WORD_COUNT; ++i)
+ val[i] = value;
+ }
+
+ LIBC_INLINE constexpr bool get_msb() const {
+ return val.back() >> (WORD_SIZE - 1);
+ }
+
+ LIBC_INLINE constexpr void set_msb() {
+ val.back() |= mask_leading_ones<WordType, 1>();
+ }
+
+ LIBC_INLINE constexpr void clear_msb() {
+ val.back() &= mask_trailing_ones<WordType, WORD_SIZE - 1>();
+ }
+
+ LIBC_INLINE constexpr void set_bit(size_t i) {
+ const size_t word_index = i / WORD_SIZE;
+ val[word_index] |= WordType(1) << (i % WORD_SIZE);
+ }
+
+ LIBC_INLINE constexpr static Division divide_unsigned(const BigInt &dividend,
+ const BigInt &divider) {
+ BigInt remainder = dividend;
+ BigInt quotient;
+ if (remainder >= divider) {
+ BigInt subtractor = divider;
+ int cur_bit = multiword::countl_zero(subtractor.val) -
+ multiword::countl_zero(remainder.val);
+ subtractor <<= cur_bit;
+ for (; cur_bit >= 0 && remainder > 0; --cur_bit, subtractor >>= 1) {
+ if (remainder < subtractor)
+ continue;
+ remainder -= subtractor;
+ quotient.set_bit(cur_bit);
+ }
+ }
+ return Division{quotient, remainder};
+ }
+
+ LIBC_INLINE constexpr static Division divide_signed(const BigInt &dividend,
+ const BigInt &divider) {
+ // Special case because it is not possible to negate the min value of a
+ // signed integer.
+ if (dividend == min() && divider == min())
+ return Division{one(), zero()};
+ // 1. Convert the dividend and divisor to unsigned representation.
+ unsigned_type udividend(dividend);
+ unsigned_type udivider(divider);
+ // 2. Negate the dividend if it's negative, and similarly for the divisor.
+ const bool dividend_is_neg = dividend.is_neg();
+ const bool divider_is_neg = divider.is_neg();
+ if (dividend_is_neg)
+ udividend.negate();
+ if (divider_is_neg)
+ udivider.negate();
+ // 3. Use unsigned multiword division algorithm.
+ const auto unsigned_result = divide_unsigned(udividend, udivider);
+ // 4. Convert the quotient and remainder to signed representation.
+ Division result;
+ result.quotient = signed_type(unsigned_result.quotient);
+ result.remainder = signed_type(unsigned_result.remainder);
+ // 5. Negate the quotient if the dividend and divisor had opposite signs.
+ if (dividend_is_neg != divider_is_neg)
+ result.quotient.negate();
+ // 6. Negate the remainder if the dividend was negative.
+ if (dividend_is_neg)
+ result.remainder.negate();
+ return result;
+ }
+
+ friend signed_type;
+ friend unsigned_type;
};
namespace internal {
@@ -962,10 +986,8 @@ using Int = BigInt<Bits, true, internal::WordTypeSelectorT<Bits>>;
// Provides limits of U/Int<128>.
template <> class cpp::numeric_limits<UInt<128>> {
public:
- LIBC_INLINE static constexpr UInt<128> max() {
- return UInt<128>({0xffff'ffff'ffff'ffff, 0xffff'ffff'ffff'ffff});
- }
- LIBC_INLINE static constexpr UInt<128> min() { return UInt<128>(0); }
+ LIBC_INLINE static constexpr UInt<128> max() { return UInt<128>::max(); }
+ LIBC_INLINE static constexpr UInt<128> min() { return UInt<128>::min(); }
// Meant to match std::numeric_limits interface.
// NOLINTNEXTLINE(readability-identifier-naming)
LIBC_INLINE_VAR static constexpr int digits = 128;
@@ -973,12 +995,8 @@ public:
template <> class cpp::numeric_limits<Int<128>> {
public:
- LIBC_INLINE static constexpr Int<128> max() {
- return Int<128>({0xffff'ffff'ffff'ffff, 0x7fff'ffff'ffff'ffff});
- }
- LIBC_INLINE static constexpr Int<128> min() {
- return Int<128>({0, 0x8000'0000'0000'0000});
- }
+ LIBC_INLINE static constexpr Int<128> max() { return Int<128>::max(); }
+ LIBC_INLINE static constexpr Int<128> min() { return Int<128>::min(); }
// Meant to match std::numeric_limits interface.
// NOLINTNEXTLINE(readability-identifier-naming)
LIBC_INLINE_VAR static constexpr int digits = 128;
@@ -1112,30 +1130,28 @@ has_single_bit(T value) {
template <typename T>
[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
countr_zero(const T &value) {
- return value.ctz();
+ return multiword::countr_zero(value.val);
}
// Specialization of cpp::countl_zero ('bit.h') for BigInt.
template <typename T>
[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
countl_zero(const T &value) {
- return value.clz();
+ return multiword::countl_zero(value.val);
}
// Specialization of cpp::countl_one ('bit.h') for BigInt.
template <typename T>
[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
countl_one(T value) {
- // TODO : Implement a faster version not involving operator~.
- return cpp::countl_zero<T>(~value);
+ return multiword::countl_one(value.val);
}
// Specialization of cpp::countr_one ('bit.h') for BigInt.
template <typename T>
[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
countr_one(T value) {
- // TODO : Implement a faster version not involving operator~.
- return cpp::countr_zero<T>(~value);
+ return multiword::countr_one(value.val);
}
// Specialization of cpp::bit_width ('bit.h') for BigInt.
@@ -1182,65 +1198,59 @@ rotr(T value, int rotate) {
template <typename T, size_t count>
LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, T>
mask_trailing_ones() {
- static_assert(!T::SIGNED);
- if (count == 0)
- return T();
- constexpr unsigned T_BITS = CHAR_BIT * sizeof(T);
- static_assert(count <= T_BITS && "Invalid bit index");
- using word_type = typename T::word_type;
- T out;
- constexpr int CHUNK_INDEX_CONTAINING_BIT =
- static_cast<int>(count / T::WORD_SIZE);
- int index = 0;
- for (auto &word : out.val) {
- if (index < CHUNK_INDEX_CONTAINING_BIT)
- word = -1;
- else if (index > CHUNK_INDEX_CONTAINING_BIT)
- word = 0;
- else
- word = mask_trailing_ones<word_type, count % T::WORD_SIZE>();
- ++index;
- }
+ static_assert(!T::SIGNED && count <= T::BITS);
+ if (count == T::BITS)
+ return T::all_ones();
+ constexpr size_t QUOTIENT = count / T::WORD_SIZE;
+ constexpr size_t REMAINDER = count % T::WORD_SIZE;
+ T out; // zero initialized
+ for (size_t i = 0; i <= QUOTIENT; ++i)
+ out[i] = i < QUOTIENT
+ ? -1
+ : mask_trailing_ones<typename T::word_type, REMAINDER>();
return out;
}
// Specialization of mask_leading_ones ('math_extras.h') for BigInt.
template <typename T, size_t count>
LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, T> mask_leading_ones() {
- static_assert(!T::SIGNED);
- if (count == 0)
- return T();
- constexpr unsigned T_BITS = CHAR_BIT * sizeof(T);
- static_assert(count <= T_BITS && "Invalid bit index");
- using word_type = typename T::word_type;
- T out;
- constexpr int CHUNK_INDEX_CONTAINING_BIT =
- static_cast<int>((T::BITS - count - 1ULL) / T::WORD_SIZE);
- int index = 0;
- for (auto &word : out.val) {
- if (index < CHUNK_INDEX_CONTAINING_BIT)
- word = 0;
- else if (index > CHUNK_INDEX_CONTAINING_BIT)
- word = -1;
- else
- word = mask_leading_ones<word_type, count % T::WORD_SIZE>();
- ++index;
- }
+ static_assert(!T::SIGNED && count <= T::BITS);
+ if (count == T::BITS)
+ return T::all_ones();
+ constexpr size_t QUOTIENT = (T::BITS - count - 1U) / T::WORD_SIZE;
+ constexpr size_t REMAINDER = count % T::WORD_SIZE;
+ T out; // zero initialized
+ for (size_t i = QUOTIENT; i < T::WORD_COUNT; ++i)
+ out[i] = i > QUOTIENT
+ ? -1
+ : mask_leading_ones<typename T::word_type, REMAINDER>();
return out;
}
+// Specialization of mask_trailing_zeros ('math_extras.h') for BigInt.
+template <typename T, size_t count>
+LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, T>
+mask_trailing_zeros() {
+ return mask_leading_ones<T, T::BITS - count>();
+}
+
+// Specialization of mask_leading_zeros ('math_extras.h') for BigInt.
+template <typename T, size_t count>
+LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, T>
+mask_leading_zeros() {
+ return mask_trailing_ones<T, T::BITS - count>();
+}
+
// Specialization of count_zeros ('math_extras.h') for BigInt.
template <typename T>
-[[nodiscard]]
-LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
+[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
count_zeros(T value) {
return cpp::popcount(~value);
}
// Specialization of first_leading_zero ('math_extras.h') for BigInt.
template <typename T>
-[[nodiscard]]
-LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
+[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
first_leading_zero(T value) {
return value == cpp::numeric_limits<T>::max() ? 0
: cpp::countl_one(value) + 1;
@@ -1248,16 +1258,14 @@ first_leading_zero(T value) {
// Specialization of first_leading_one ('math_extras.h') for BigInt.
template <typename T>
-[[nodiscard]]
-LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
+[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
first_leading_one(T value) {
return first_leading_zero(~value);
}
// Specialization of first_trailing_zero ('math_extras.h') for BigInt.
template <typename T>
-[[nodiscard]]
-LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
+[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
first_trailing_zero(T value) {
return value == cpp::numeric_limits<T>::max() ? 0
: cpp::countr_zero(~value) + 1;
@@ -1265,8 +1273,7 @@ first_trailing_zero(T value) {
// Specialization of first_trailing_one ('math_extras.h') for BigInt.
template <typename T>
-[[nodiscard]]
-LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
+[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<is_big_int_v<T>, int>
first_trailing_one(T value) {
return value == cpp::numeric_limits<T>::max() ? 0
: cpp::countr_zero(value) + 1;
diff --git a/libc/src/__support/float_to_string.h b/libc/src/__support/float_to_string.h
index 1287c3e9a84f..4c59cfd99c2e 100644
--- a/libc/src/__support/float_to_string.h
+++ b/libc/src/__support/float_to_string.h
@@ -689,7 +689,7 @@ template <> class FloatToString<long double> {
wide_int float_as_int = mantissa;
- float_as_int.shift_left(exponent);
+ float_as_int <<= exponent;
int_block_index = 0;
while (float_as_int > 0) {
@@ -708,10 +708,11 @@ template <> class FloatToString<long double> {
const int SHIFT_AMOUNT = FLOAT_AS_INT_WIDTH + exponent;
static_assert(EXTRA_INT_WIDTH >= sizeof(long double) * 8);
- float_as_fixed.shift_left(SHIFT_AMOUNT);
+ float_as_fixed <<= SHIFT_AMOUNT;
// If there are still digits above the decimal point, handle those.
- if (float_as_fixed.clz() < static_cast<int>(EXTRA_INT_WIDTH)) {
+ if (cpp::countl_zero(float_as_fixed) <
+ static_cast<int>(EXTRA_INT_WIDTH)) {
UInt<EXTRA_INT_WIDTH> above_decimal_point =
float_as_fixed >> FLOAT_AS_INT_WIDTH;
diff --git a/libc/src/__support/integer_literals.h b/libc/src/__support/integer_literals.h
index de1f88fbd3f3..e99799c3512e 100644
--- a/libc/src/__support/integer_literals.h
+++ b/libc/src/__support/integer_literals.h
@@ -151,12 +151,15 @@ template <size_t N> struct Parser<LIBC_NAMESPACE::UInt<N>> {
template <typename T>
LIBC_INLINE constexpr T parse_with_prefix(const char *ptr) {
using P = Parser<T>;
- if (ptr[0] == '0' && ptr[1] == 'x')
- return P::template parse<16>(ptr + 2);
- else if (ptr[0] == '0' && ptr[1] == 'b')
- return P::template parse<2>(ptr + 2);
- else
- return P::template parse<10>(ptr);
+ if (ptr == nullptr)
+ return T();
+ if (ptr[0] == '0') {
+ if (ptr[1] == 'b')
+ return P::template parse<2>(ptr + 2);
+ if (ptr[1] == 'x')
+ return P::template parse<16>(ptr + 2);
+ }
+ return P::template parse<10>(ptr);
}
} // namespace internal
@@ -169,6 +172,16 @@ LIBC_INLINE constexpr auto operator""_u256(const char *x) {
return internal::parse_with_prefix<UInt<256>>(x);
}
+template <typename T> LIBC_INLINE constexpr T parse_bigint(const char *ptr) {
+ if (ptr == nullptr)
+ return T();
+ if (ptr[0] == '-' || ptr[0] == '+') {
+ auto positive = internal::parse_with_prefix<T>(ptr + 1);
+ return ptr[0] == '-' ? -positive : positive;
+ }
+ return internal::parse_with_prefix<T>(ptr);
+}
+
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC___SUPPORT_INTEGER_LITERALS_H
diff --git a/libc/src/__support/math_extras.h b/libc/src/__support/math_extras.h
index 70a8800b285d..4bd871957406 100644
--- a/libc/src/__support/math_extras.h
+++ b/libc/src/__support/math_extras.h
@@ -10,9 +10,9 @@
#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXTRAS_H
#define LLVM_LIBC_SRC___SUPPORT_MATH_EXTRAS_H
-#include "src/__support/CPP/bit.h" // countl_one, countr_zero
-#include "src/__support/CPP/limits.h" // CHAR_BIT, numeric_limits
-#include "src/__support/CPP/type_traits.h" // is_unsigned_v
+#include "src/__support/CPP/bit.h" // countl_one, countr_zero
+#include "src/__support/CPP/limits.h" // CHAR_BIT, numeric_limits
+#include "src/__support/CPP/type_traits.h" // is_unsigned_v, is_constant_evaluated
#include "src/__support/macros/attributes.h" // LIBC_INLINE
namespace LIBC_NAMESPACE {
@@ -32,199 +32,94 @@ mask_trailing_ones() {
template <typename T, size_t count>
LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_unsigned_v<T>, T>
mask_leading_ones() {
- constexpr T MASK(mask_trailing_ones<T, CHAR_BIT * sizeof(T) - count>());
- return T(~MASK); // bitwise NOT performs integer promotion.
+ return T(~mask_trailing_ones<T, CHAR_BIT * sizeof(T) - count>());
}
-// Add with carry
-template <typename T> struct SumCarry {
- T sum;
- T carry;
-};
-
-// This version is always valid for constexpr.
-template <typename T>
-LIBC_INLINE constexpr cpp::enable_if_t<
- cpp::is_integral_v<T> && cpp::is_unsigned_v<T>, SumCarry<T>>
-add_with_carry_const(T a, T b, T carry_in) {
- T tmp = a + carry_in;
- T sum = b + tmp;
- T carry_out = (sum < b) + (tmp < a);
- return {sum, carry_out};
-}
-
-template <typename T>
-LIBC_INLINE constexpr cpp::enable_if_t<
- cpp::is_integral_v<T> && cpp::is_unsigned_v<T>, SumCarry<T>>
-add_with_carry(T a, T b, T carry_in) {
- return add_with_carry_const<T>(a, b, carry_in);
-}
-
-#if __has_builtin(__builtin_addc)
-// https://clang.llvm.org/docs/LanguageExtensions.html#multiprecision-arithmetic-builtins
-
-template <>
-LIBC_INLINE constexpr SumCarry<unsigned char>
-add_with_carry<unsigned char>(unsigned char a, unsigned char b,
- unsigned char carry_in) {
- if (__builtin_is_constant_evaluated()) {
- return add_with_carry_const<unsigned char>(a, b, carry_in);
- } else {
- SumCarry<unsigned char> result{0, 0};
- result.sum = __builtin_addcb(a, b, carry_in, &result.carry);
- return result;
- }
-}
-
-template <>
-LIBC_INLINE constexpr SumCarry<unsigned short>
-add_with_carry<unsigned short>(unsigned short a, unsigned short b,
- unsigned short carry_in) {
- if (__builtin_is_constant_evaluated()) {
- return add_with_carry_const<unsigned short>(a, b, carry_in);
- } else {
- SumCarry<unsigned short> result{0, 0};
- result.sum = __builtin_addcs(a, b, carry_in, &result.carry);
- return result;
- }
-}
-
-template <>
-LIBC_INLINE constexpr SumCarry<unsigned int>
-add_with_carry<unsigned int>(unsigned int a, unsigned int b,
- unsigned int carry_in) {
- if (__builtin_is_constant_evaluated()) {
- return add_with_carry_const<unsigned int>(a, b, carry_in);
- } else {
- SumCarry<unsigned int> result{0, 0};
- result.sum = __builtin_addc(a, b, carry_in, &result.carry);
- return result;
- }
-}
-
-template <>
-LIBC_INLINE constexpr SumCarry<unsigned long>
-add_with_carry<unsigned long>(unsigned long a, unsigned long b,
- unsigned long carry_in) {
- if (__builtin_is_constant_evaluated()) {
- return add_with_carry_const<unsigned long>(a, b, carry_in);
- } else {
- SumCarry<unsigned long> result{0, 0};
- result.sum = __builtin_addcl(a, b, carry_in, &result.carry);
- return result;
- }
+// Create a bitmask with the count right-most bits set to 0, and all other bits
+// set to 1. Only unsigned types are allowed.
+template <typename T, size_t count>
+LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_unsigned_v<T>, T>
+mask_trailing_zeros() {
+ return mask_leading_ones<T, CHAR_BIT * sizeof(T) - count>();
}
-template <>
-LIBC_INLINE constexpr SumCarry<unsigned long long>
-add_with_carry<unsigned long long>(unsigned long long a, unsigned long long b,
- unsigned long long carry_in) {
- if (__builtin_is_constant_evaluated()) {
- return add_with_carry_const<unsigned long long>(a, b, carry_in);
- } else {
- SumCarry<unsigned long long> result{0, 0};
- result.sum = __builtin_addcll(a, b, carry_in, &result.carry);
- return result;
- }
+// Create a bitmask with the count left-most bits set to 0, and all other bits
+// set to 1. Only unsigned types are allowed.
+template <typename T, size_t count>
+LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_unsigned_v<T>, T>
+mask_leading_zeros() {
+ return mask_trailing_ones<T, CHAR_BIT * sizeof(T) - count>();
}
-#endif // __has_builtin(__builtin_addc)
-
-// Subtract with borrow
-template <typename T> struct DiffBorrow {
- T diff;
- T borrow;
-};
-
-// This version is always valid for constexpr.
+// Returns whether 'a + b' overflows; the result is stored in 'res'.
template <typename T>
-LIBC_INLINE constexpr cpp::enable_if_t<
- cpp::is_integral_v<T> && cpp::is_unsigned_v<T>, DiffBorrow<T>>
-sub_with_borrow_const(T a, T b, T borrow_in) {
- T tmp = a - b;
- T diff = tmp - borrow_in;
- T borrow_out = (diff > tmp) + (tmp > a);
- return {diff, borrow_out};
+[[nodiscard]] LIBC_INLINE constexpr bool add_overflow(T a, T b, T &res) {
+ return __builtin_add_overflow(a, b, &res);
}
-// This version is not always valid for constepxr because it's overriden below
-// if builtins are available.
+// Returns whether 'a - b' overflows; the result is stored in 'res'.
template <typename T>
-LIBC_INLINE constexpr cpp::enable_if_t<
- cpp::is_integral_v<T> && cpp::is_unsigned_v<T>, DiffBorrow<T>>
-sub_with_borrow(T a, T b, T borrow_in) {
- return sub_with_borrow_const<T>(a, b, borrow_in);
-}
-
-#if __has_builtin(__builtin_subc)
-// https://clang.llvm.org/docs/LanguageExtensions.html#multiprecision-arithmetic-builtins
-
-template <>
-LIBC_INLINE constexpr DiffBorrow<unsigned char>
-sub_with_borrow<unsigned char>(unsigned char a, unsigned char b,
- unsigned char borrow_in) {
- if (__builtin_is_constant_evaluated()) {
- return sub_with_borrow_const<unsigned char>(a, b, borrow_in);
- } else {
- DiffBorrow<unsigned char> result{0, 0};
- result.diff = __builtin_subcb(a, b, borrow_in, &result.borrow);
- return result;
- }
+[[nodiscard]] LIBC_INLINE constexpr bool sub_overflow(T a, T b, T &res) {
+ return __builtin_sub_overflow(a, b, &res);
}
-template <>
-LIBC_INLINE constexpr DiffBorrow<unsigned short>
-sub_with_borrow<unsigned short>(unsigned short a, unsigned short b,
- unsigned short borrow_in) {
- if (__builtin_is_constant_evaluated()) {
- return sub_with_borrow_const<unsigned short>(a, b, borrow_in);
- } else {
- DiffBorrow<unsigned short> result{0, 0};
- result.diff = __builtin_subcs(a, b, borrow_in, &result.borrow);
- return result;
- }
-}
+#define RETURN_IF(TYPE, BUILTIN) \
+ if constexpr (cpp::is_same_v<T, TYPE>) \
+ return BUILTIN(a, b, carry_in, carry_out);
-template <>
-LIBC_INLINE constexpr DiffBorrow<unsigned int>
-sub_with_borrow<unsigned int>(unsigned int a, unsigned int b,
- unsigned int borrow_in) {
- if (__builtin_is_constant_evaluated()) {
- return sub_with_borrow_const<unsigned int>(a, b, borrow_in);
- } else {
- DiffBorrow<unsigned int> result{0, 0};
- result.diff = __builtin_subc(a, b, borrow_in, &result.borrow);
- return result;
- }
-}
-
-template <>
-LIBC_INLINE constexpr DiffBorrow<unsigned long>
-sub_with_borrow<unsigned long>(unsigned long a, unsigned long b,
- unsigned long borrow_in) {
- if (__builtin_is_constant_evaluated()) {
- return sub_with_borrow_const<unsigned long>(a, b, borrow_in);
- } else {
- DiffBorrow<unsigned long> result{0, 0};
- result.diff = __builtin_subcl(a, b, borrow_in, &result.borrow);
- return result;
+// Returns the result of 'a + b' taking into account 'carry_in'.
+// The carry out is stored in 'carry_out'.
+// The separate carry-out parameter mirrors the interface of the compiler builtins.
+template <typename T>
+[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_unsigned_v<T>, T>
+add_with_carry(T a, T b, T carry_in, T &carry_out) {
+ if constexpr (!cpp::is_constant_evaluated()) {
+#if __has_builtin(__builtin_addcb)
+ RETURN_IF(unsigned char, __builtin_addcb)
+#elif __has_builtin(__builtin_addcs)
+ RETURN_IF(unsigned short, __builtin_addcs)
+#elif __has_builtin(__builtin_addc)
+ RETURN_IF(unsigned int, __builtin_addc)
+#elif __has_builtin(__builtin_addcl)
+ RETURN_IF(unsigned long, __builtin_addcl)
+#elif __has_builtin(__builtin_addcll)
+ RETURN_IF(unsigned long long, __builtin_addcll)
+#endif
}
+ T sum = {};
+ T carry1 = add_overflow(a, b, sum);
+ T carry2 = add_overflow(sum, carry_in, sum);
+ carry_out = carry1 | carry2;
+ return sum;
}
-template <>
-LIBC_INLINE constexpr DiffBorrow<unsigned long long>
-sub_with_borrow<unsigned long long>(unsigned long long a, unsigned long long b,
- unsigned long long borrow_in) {
- if (__builtin_is_constant_evaluated()) {
- return sub_with_borrow_const<unsigned long long>(a, b, borrow_in);
- } else {
- DiffBorrow<unsigned long long> result{0, 0};
- result.diff = __builtin_subcll(a, b, borrow_in, &result.borrow);
- return result;
+// Returns the result of 'a - b' taking into account 'carry_in'.
+// The borrow out is stored in 'carry_out'.
+// The separate carry-out parameter mirrors the interface of the compiler builtins.
+template <typename T>
+[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_unsigned_v<T>, T>
+sub_with_borrow(T a, T b, T carry_in, T &carry_out) {
+ if constexpr (!cpp::is_constant_evaluated()) {
+#if __has_builtin(__builtin_subcb)
+ RETURN_IF(unsigned char, __builtin_subcb)
+#elif __has_builtin(__builtin_subcs)
+ RETURN_IF(unsigned short, __builtin_subcs)
+#elif __has_builtin(__builtin_subc)
+ RETURN_IF(unsigned int, __builtin_subc)
+#elif __has_builtin(__builtin_subcl)
+ RETURN_IF(unsigned long, __builtin_subcl)
+#elif __has_builtin(__builtin_subcll)
+ RETURN_IF(unsigned long long, __builtin_subcll)
+#endif
}
+ T sub = {};
+ T carry1 = sub_overflow(a, b, sub);
+ T carry2 = sub_overflow(sub, carry_in, sub);
+ carry_out = carry1 | carry2;
+ return sub;
}
-#endif // __has_builtin(__builtin_subc)
+#undef RETURN_IF
template <typename T>
[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_unsigned_v<T>, int>
diff --git a/libc/src/__support/number_pair.h b/libc/src/__support/number_pair.h
index ee6667b1299f..2f713fc03520 100644
--- a/libc/src/__support/number_pair.h
+++ b/libc/src/__support/number_pair.h
@@ -20,17 +20,6 @@ template <typename T> struct NumberPair {
T hi = T(0);
};
-template <typename T>
-cpp::enable_if_t<cpp::is_integral_v<T> && cpp::is_unsigned_v<T>,
- NumberPair<T>> constexpr split(T a) {
- constexpr size_t HALF_BIT_WIDTH = sizeof(T) * 4;
- constexpr T LOWER_HALF_MASK = (T(1) << HALF_BIT_WIDTH) - T(1);
- NumberPair<T> result;
- result.lo = a & LOWER_HALF_MASK;
- result.hi = a >> HALF_BIT_WIDTH;
- return result;
-}
-
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC___SUPPORT_NUMBER_PAIR_H
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index b67ee3a8e0df..c89792b8ac7b 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -88,6 +88,8 @@ add_math_entrypoint_object(expf)
add_math_entrypoint_object(exp2)
add_math_entrypoint_object(exp2f)
+add_math_entrypoint_object(exp2m1f)
+
add_math_entrypoint_object(exp10)
add_math_entrypoint_object(exp10f)
diff --git a/libc/src/math/exp2m1f.h b/libc/src/math/exp2m1f.h
new file mode 100644
index 000000000000..0eaf6b00e958
--- /dev/null
+++ b/libc/src/math/exp2m1f.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for exp2m1f -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_EXP2M1F_H
+#define LLVM_LIBC_SRC_MATH_EXP2M1F_H
+
+namespace LIBC_NAMESPACE {
+
+float exp2m1f(float x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_EXP2M1F_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index b164d33e204b..dc77f8b5ddba 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -838,6 +838,27 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ exp2m1f
+ SRCS
+ exp2m1f.cpp
+ HDRS
+ ../exp2m1f.h
+ DEPENDS
+ .explogxf
+ libc.src.errno.errno
+ libc.src.__support.common
+ libc.src.__support.FPUtil.fenv_impl
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.polyeval
+ libc.src.__support.FPUtil.rounding_mode
+ libc.src.__support.macros.optimization
+ libc.src.__support.macros.properties.cpu_features
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
exp10
SRCS
exp10.cpp
diff --git a/libc/src/math/generic/exp2m1f.cpp b/libc/src/math/generic/exp2m1f.cpp
new file mode 100644
index 000000000000..c60930dd782e
--- /dev/null
+++ b/libc/src/math/generic/exp2m1f.cpp
@@ -0,0 +1,183 @@
+//===-- Implementation of exp2m1f function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/exp2m1f.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/optimization.h"
+#include "src/__support/macros/properties/cpu_features.h"
+#include "src/errno/libc_errno.h"
+
+#include "explogxf.h"
+
+namespace LIBC_NAMESPACE {
+
+static constexpr size_t N_EXCEPTS_LO = 8;
+
+static constexpr fputil::ExceptValues<float, N_EXCEPTS_LO> EXP2M1F_EXCEPTS_LO =
+ {{
+ // (input, RZ output, RU offset, RD offset, RN offset)
+ // x = 0x1.36dc8ep-36, exp2m1f(x) = 0x1.aef212p-37 (RZ)
+ {0x2d9b'6e47U, 0x2d57'7909U, 1U, 0U, 0U},
+ // x = 0x1.224936p-19, exp2m1f(x) = 0x1.926c0ep-20 (RZ)
+ {0x3611'249bU, 0x35c9'3607U, 1U, 0U, 1U},
+ // x = 0x1.d16d2p-20, exp2m1f(x) = 0x1.429becp-20 (RZ)
+ {0x35e8'b690U, 0x35a1'4df6U, 1U, 0U, 1U},
+ // x = 0x1.17949ep-14, exp2m1f(x) = 0x1.8397p-15 (RZ)
+ {0x388b'ca4fU, 0x3841'cb80U, 1U, 0U, 1U},
+ // x = -0x1.9c3e1ep-38, exp2m1f(x) = -0x1.1dbeacp-38 (RZ)
+ {0xacce'1f0fU, 0xac8e'df56U, 0U, 1U, 0U},
+ // x = -0x1.4d89b4p-32, exp2m1f(x) = -0x1.ce61b6p-33 (RZ)
+ {0xafa6'c4daU, 0xaf67'30dbU, 0U, 1U, 1U},
+ // x = -0x1.a6eac4p-10, exp2m1f(x) = -0x1.24fadap-10 (RZ)
+ {0xbad3'7562U, 0xba92'7d6dU, 0U, 1U, 1U},
+ // x = -0x1.e7526ep-6, exp2m1f(x) = -0x1.4e53dep-6 (RZ)
+ {0xbcf3'a937U, 0xbca7'29efU, 0U, 1U, 1U},
+ }};
+
+static constexpr size_t N_EXCEPTS_HI = 3;
+
+static constexpr fputil::ExceptValues<float, N_EXCEPTS_HI> EXP2M1F_EXCEPTS_HI =
+ {{
+ // (input, RZ output, RU offset, RD offset, RN offset)
+ // x = 0x1.16a972p-1, exp2m1f(x) = 0x1.d545b2p-2 (RZ)
+ {0x3f0b'54b9U, 0x3eea'a2d9U, 1U, 0U, 0U},
+ // x = -0x1.9f12acp-5, exp2m1f(x) = -0x1.1ab68cp-5 (RZ)
+ {0xbd4f'8956U, 0xbd0d'5b46U, 0U, 1U, 0U},
+ // x = -0x1.de7b9cp-5, exp2m1f(x) = -0x1.4508f4p-5 (RZ)
+ {0xbd6f'3dceU, 0xbd22'847aU, 0U, 1U, 1U},
+ }};
+
+LLVM_LIBC_FUNCTION(float, exp2m1f, (float x)) {
+ using FPBits = fputil::FPBits<float>;
+ FPBits xbits(x);
+
+ uint32_t x_u = xbits.uintval();
+ uint32_t x_abs = x_u & 0x7fff'ffffU;
+
+ // When |x| >= 128, or x is nan, or |x| <= 2^-5
+ if (LIBC_UNLIKELY(x_abs >= 0x4300'0000U || x_abs <= 0x3d00'0000U)) {
+ // |x| <= 2^-5
+ if (x_abs <= 0x3d00'0000U) {
+ if (auto r = EXP2M1F_EXCEPTS_LO.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+ return r.value();
+
+ // Minimax polynomial generated by Sollya with:
+ // > display = hexadecimal;
+ // > fpminimax((2^x - 1)/x, 5, [|D...|], [-2^-5, 2^-5]);
+ constexpr double COEFFS[] = {
+ 0x1.62e42fefa39f3p-1, 0x1.ebfbdff82c57bp-3, 0x1.c6b08d6f2d7aap-5,
+ 0x1.3b2ab6fc92f5dp-7, 0x1.5d897cfe27125p-10, 0x1.43090e61e6af1p-13};
+ double xd = x;
+ double xsq = xd * xd;
+ double c0 = fputil::multiply_add(xd, COEFFS[1], COEFFS[0]);
+ double c1 = fputil::multiply_add(xd, COEFFS[3], COEFFS[2]);
+ double c2 = fputil::multiply_add(xd, COEFFS[5], COEFFS[4]);
+ double p = fputil::polyeval(xsq, c0, c1, c2);
+ return static_cast<float>(p * xd);
+ }
+
+ // x >= 128, or x is nan
+ if (xbits.is_pos()) {
+ if (xbits.is_finite()) {
+ int rounding = fputil::quick_get_round();
+ if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
+ return FPBits::max_normal().get_val();
+
+ fputil::set_errno_if_required(ERANGE);
+ fputil::raise_except_if_required(FE_OVERFLOW);
+ }
+
+ // x >= 128 and 2^x - 1 rounds to +inf, or x is +inf or nan
+ return x + FPBits::inf().get_val();
+ }
+ }
+
+ if (LIBC_UNLIKELY(x <= -25.0f)) {
+ // 2^(-inf) - 1 = -1
+ if (xbits.is_inf())
+ return -1.0f;
+ // 2^nan - 1 = nan
+ if (xbits.is_nan())
+ return x;
+
+ int rounding = fputil::quick_get_round();
+ if (rounding == FE_UPWARD || rounding == FE_TOWARDZERO)
+ return -0x1.ffff'fep-1f; // -1.0f + 0x1.0p-24f
+
+ fputil::set_errno_if_required(ERANGE);
+ fputil::raise_except_if_required(FE_UNDERFLOW);
+ return -1.0f;
+ }
+
+ if (auto r = EXP2M1F_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+ return r.value();
+
+ // For -25 < x < 128, to compute 2^x, we perform the following range
+ // reduction: find hi, mid, lo such that:
+ // x = hi + mid + lo, in which:
+ // hi is an integer,
+ // 0 <= mid * 2^5 < 32 is an integer,
+ // -2^(-6) <= lo <= 2^(-6).
+ // In particular,
+ // hi + mid = round(x * 2^5) * 2^(-5).
+ // Then,
+ // 2^x = 2^(hi + mid + lo) = 2^hi * 2^mid * 2^lo.
+ // 2^mid is stored in the lookup table of 32 elements.
+ // 2^lo is computed using a degree-4 minimax polynomial generated by Sollya.
+ // We perform 2^hi * 2^mid by simply add hi to the exponent field of 2^mid.
+
+ // kf = (hi + mid) * 2^5 = round(x * 2^5)
+ float kf;
+ int k;
+#ifdef LIBC_TARGET_CPU_HAS_NEAREST_INT
+ kf = fputil::nearest_integer(x * 32.0f);
+ k = static_cast<int>(kf);
+#else
+ constexpr float HALF[2] = {0.5f, -0.5f};
+ k = static_cast<int>(fputil::multiply_add(x, 32.0f, HALF[x < 0.0f]));
+ kf = static_cast<float>(k);
+#endif // LIBC_TARGET_CPU_HAS_NEAREST_INT
+
+ // lo = x - (hi + mid) = x - kf * 2^(-5)
+ double lo = fputil::multiply_add(-0x1.0p-5f, kf, x);
+
+ // hi = floor(kf * 2^(-4))
+ // exp2_hi = shift hi to the exponent field of double precision.
+ int64_t exp2_hi =
+ static_cast<int64_t>(static_cast<uint64_t>(k >> ExpBase::MID_BITS)
+ << fputil::FPBits<double>::FRACTION_LEN);
+ // mh = 2^hi * 2^mid
+ // mh_bits = bit field of mh
+ int64_t mh_bits = ExpBase::EXP_2_MID[k & ExpBase::MID_MASK] + exp2_hi;
+ double mh = fputil::FPBits<double>(static_cast<uint64_t>(mh_bits)).get_val();
+
+ // Degree-4 polynomial approximating (2^x - 1)/x generated by Sollya with:
+ // > display = hexadecimal;
+ // > fpminimax((2^x - 1)/x, 4, [|D...|], [-2^-6, 2^-6]);
+ constexpr double COEFFS[5] = {0x1.62e42fefa39efp-1, 0x1.ebfbdff8131c4p-3,
+ 0x1.c6b08d7061695p-5, 0x1.3b2b1bee74b2ap-7,
+ 0x1.5d88091198529p-10};
+ double lo_sq = lo * lo;
+ double c1 = fputil::multiply_add(lo, COEFFS[0], 1.0);
+ double c2 = fputil::multiply_add(lo, COEFFS[2], COEFFS[1]);
+ double c3 = fputil::multiply_add(lo, COEFFS[4], COEFFS[3]);
+ double exp2_lo = fputil::polyeval(lo_sq, c1, c2, c3);
+ // 2^x - 1 = 2^(hi + mid + lo) - 1
+ // = 2^(hi + mid) * 2^lo - 1
+ // ~ mh * (1 + lo * P(lo)) - 1
+ // = mh * exp2_lo - 1
+ return static_cast<float>(fputil::multiply_add(exp2_lo, mh, -1.0));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/fseeko.h b/libc/src/stdio/fseeko.h
index 77fb41215c31..3202ed2f97d0 100644
--- a/libc/src/stdio/fseeko.h
+++ b/libc/src/stdio/fseeko.h
@@ -10,6 +10,7 @@
#define LLVM_LIBC_SRC_STDIO_FSEEKO_H
#include <stdio.h>
+#include <unistd.h>
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdio/ftello.h b/libc/src/stdio/ftello.h
index 5ab17f9244a5..0fdf13ab6bdb 100644
--- a/libc/src/stdio/ftello.h
+++ b/libc/src/stdio/ftello.h
@@ -10,6 +10,7 @@
#define LLVM_LIBC_SRC_STDIO_FTELLO_H
#include <stdio.h>
+#include <unistd.h>
namespace LIBC_NAMESPACE {
diff --git a/libc/test/src/__support/integer_literals_test.cpp b/libc/test/src/__support/integer_literals_test.cpp
index 5298cf30156e..cbc906aa7c99 100644
--- a/libc/test/src/__support/integer_literals_test.cpp
+++ b/libc/test/src/__support/integer_literals_test.cpp
@@ -133,3 +133,24 @@ TEST(LlvmLibcIntegerLiteralTest, u256) {
U256_MAX,
0xFFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF_u256);
}
+
+TEST(LlvmLibcIntegerLiteralTest, parse_bigint) {
+ using T = LIBC_NAMESPACE::Int<128>;
+ struct {
+ const char *str;
+ T expected;
+ } constexpr TEST_CASES[] = {
+ {"0", 0}, {"-1", -1}, {"+1", 1}, {"-0xFF", -255}, {"-0b11", -3},
+ };
+ for (auto tc : TEST_CASES) {
+ T actual = LIBC_NAMESPACE::parse_bigint<T>(tc.str);
+ EXPECT_EQ(actual, tc.expected);
+ }
+}
+
+TEST(LlvmLibcIntegerLiteralTest, parse_bigint_invalid) {
+ using T = LIBC_NAMESPACE::Int<128>;
+ const T expected; // default construction
+ EXPECT_EQ(LIBC_NAMESPACE::parse_bigint<T>(nullptr), expected);
+ EXPECT_EQ(LIBC_NAMESPACE::parse_bigint<T>(""), expected);
+}
diff --git a/libc/test/src/__support/math_extras_test.cpp b/libc/test/src/__support/math_extras_test.cpp
index e88b3e1d6b68..401e631ea4ba 100644
--- a/libc/test/src/__support/math_extras_test.cpp
+++ b/libc/test/src/__support/math_extras_test.cpp
@@ -101,4 +101,61 @@ TYPED_TEST(LlvmLibcBitTest, CountZeros, UnsignedTypesNoBigInt) {
EXPECT_EQ(count_zeros<T>(cpp::numeric_limits<T>::max() >> i), i);
}
+using UnsignedTypes = testing::TypeList<
+#if defined(__SIZEOF_INT128__)
+ __uint128_t,
+#endif
+ unsigned char, unsigned short, unsigned int, unsigned long,
+ unsigned long long>;
+
+TYPED_TEST(LlvmLibcBlockMathExtrasTest, add_overflow, UnsignedTypes) {
+ constexpr T ZERO = cpp::numeric_limits<T>::min();
+ constexpr T ONE(1);
+ constexpr T MAX = cpp::numeric_limits<T>::max();
+ constexpr T BEFORE_MAX = MAX - 1;
+
+ const struct {
+ T lhs;
+ T rhs;
+ T sum;
+ bool carry;
+ } TESTS[] = {
+ {ZERO, ONE, ONE, false}, // 0x00 + 0x01 = 0x01
+ {BEFORE_MAX, ONE, MAX, false}, // 0xFE + 0x01 = 0xFF
+ {MAX, ONE, ZERO, true}, // 0xFF + 0x01 = 0x00 (carry)
+ {MAX, MAX, BEFORE_MAX, true}, // 0xFF + 0xFF = 0xFE (carry)
+ };
+ for (auto tc : TESTS) {
+ T sum;
+ bool carry = add_overflow<T>(tc.lhs, tc.rhs, sum);
+ EXPECT_EQ(sum, tc.sum);
+ EXPECT_EQ(carry, tc.carry);
+ }
+}
+
+TYPED_TEST(LlvmLibcBlockMathExtrasTest, sub_overflow, UnsignedTypes) {
+ constexpr T ZERO = cpp::numeric_limits<T>::min();
+ constexpr T ONE(1);
+ constexpr T MAX = cpp::numeric_limits<T>::max();
+ constexpr T BEFORE_MAX = MAX - 1;
+
+ const struct {
+ T lhs;
+ T rhs;
+ T sub;
+ bool carry;
+ } TESTS[] = {
+ {ONE, ZERO, ONE, false}, // 0x01 - 0x00 = 0x01
+ {MAX, MAX, ZERO, false}, // 0xFF - 0xFF = 0x00
+ {ZERO, ONE, MAX, true}, // 0x00 - 0x01 = 0xFF (carry)
+ {BEFORE_MAX, MAX, MAX, true}, // 0xFE - 0xFF = 0xFF (carry)
+ };
+ for (auto tc : TESTS) {
+ T sub;
+ bool carry = sub_overflow<T>(tc.lhs, tc.rhs, sub);
+ EXPECT_EQ(sub, tc.sub);
+ EXPECT_EQ(carry, tc.carry);
+ }
+}
+
} // namespace LIBC_NAMESPACE
diff --git a/libc/test/src/__support/uint_test.cpp b/libc/test/src/__support/uint_test.cpp
index 5764324ca288..5696e54c73f3 100644
--- a/libc/test/src/__support/uint_test.cpp
+++ b/libc/test/src/__support/uint_test.cpp
@@ -8,6 +8,7 @@
#include "src/__support/CPP/optional.h"
#include "src/__support/UInt.h"
+#include "src/__support/integer_literals.h" // parse_unsigned_bigint
#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
#include "include/llvm-libc-macros/math-macros.h" // HUGE_VALF, HUGE_VALF
@@ -15,6 +16,195 @@
namespace LIBC_NAMESPACE {
+enum Value { ZERO, ONE, TWO, MIN, MAX };
+
+template <typename T> auto create(Value value) {
+ switch (value) {
+ case ZERO:
+ return T(0);
+ case ONE:
+ return T(1);
+ case TWO:
+ return T(2);
+ case MIN:
+ return T::min();
+ case MAX:
+ return T::max();
+ }
+}
+
+using Types = testing::TypeList< //
+#ifdef LIBC_TYPES_HAS_INT64
+ BigInt<64, false, uint64_t>, // 64-bits unsigned (1 x uint64_t)
+ BigInt<64, true, uint64_t>, // 64-bits signed (1 x uint64_t)
+#endif
+#ifdef LIBC_TYPES_HAS_INT128
+ BigInt<128, false, __uint128_t>, // 128-bits unsigned (1 x __uint128_t)
+ BigInt<128, true, __uint128_t>, // 128-bits signed (1 x __uint128_t)
+#endif
+ BigInt<16, false, uint16_t>, // 16-bits unsigned (1 x uint16_t)
+ BigInt<16, true, uint16_t>, // 16-bits signed (1 x uint16_t)
+ BigInt<64, false, uint16_t>, // 64-bits unsigned (4 x uint16_t)
+ BigInt<64, true, uint16_t> // 64-bits signed (4 x uint16_t)
+ >;
+
+#define ASSERT_SAME(A, B) ASSERT_TRUE((A) == (B))
+
+TYPED_TEST(LlvmLibcUIntClassTest, Additions, Types) {
+ ASSERT_SAME(create<T>(ZERO) + create<T>(ZERO), create<T>(ZERO));
+ ASSERT_SAME(create<T>(ONE) + create<T>(ZERO), create<T>(ONE));
+ ASSERT_SAME(create<T>(ZERO) + create<T>(ONE), create<T>(ONE));
+ ASSERT_SAME(create<T>(ONE) + create<T>(ONE), create<T>(TWO));
+ // 2's complement addition works for signed and unsigned types.
+ // - unsigned : 0xff + 0x01 = 0x00 (255 + 1 = 0)
+ // - signed : 0x7f + 0x01 = 0x80 (127 + 1 = -128)
+ ASSERT_SAME(create<T>(MAX) + create<T>(ONE), create<T>(MIN));
+}
+
+TYPED_TEST(LlvmLibcUIntClassTest, Subtraction, Types) {
+ ASSERT_SAME(create<T>(ZERO) - create<T>(ZERO), create<T>(ZERO));
+ ASSERT_SAME(create<T>(ONE) - create<T>(ONE), create<T>(ZERO));
+ ASSERT_SAME(create<T>(ONE) - create<T>(ZERO), create<T>(ONE));
+ // 2's complement subtraction works for signed and unsigned types.
+ // - unsigned : 0x00 - 0x01 = 0xff ( 0 - 1 = 255)
+ // - signed : 0x80 - 0x01 = 0x7f (-128 - 1 = 127)
+ ASSERT_SAME(create<T>(MIN) - create<T>(ONE), create<T>(MAX));
+}
+
+TYPED_TEST(LlvmLibcUIntClassTest, Multiplication, Types) {
+ ASSERT_SAME(create<T>(ZERO) * create<T>(ZERO), create<T>(ZERO));
+ ASSERT_SAME(create<T>(ZERO) * create<T>(ONE), create<T>(ZERO));
+ ASSERT_SAME(create<T>(ONE) * create<T>(ZERO), create<T>(ZERO));
+ ASSERT_SAME(create<T>(ONE) * create<T>(ONE), create<T>(ONE));
+ ASSERT_SAME(create<T>(ONE) * create<T>(TWO), create<T>(TWO));
+ ASSERT_SAME(create<T>(TWO) * create<T>(ONE), create<T>(TWO));
+ // - unsigned : 0xff x 0xff = 0x01 (mod 0x100)
+ // - signed : 0x7f x 0x7f = 0x01 (mod 0x100)
+ ASSERT_SAME(create<T>(MAX) * create<T>(MAX), create<T>(ONE));
+}
+
+template <typename T> void print(const char *msg, T value) {
+ testing::tlog << msg;
+ IntegerToString<T, radix::Hex> buffer(value);
+ testing::tlog << buffer.view() << "\n";
+}
+
+TEST(LlvmLibcUIntClassTest, SignedAddSub) {
+ // Computations performed by https://www.wolframalpha.com/
+ using T = BigInt<128, true, uint32_t>;
+ const T a = parse_bigint<T>("1927508279017230597");
+ const T b = parse_bigint<T>("278789278723478925");
+ const T s = parse_bigint<T>("2206297557740709522");
+ // Addition
+ ASSERT_SAME(a + b, s);
+ ASSERT_SAME(b + a, s); // commutative
+ // Subtraction
+ ASSERT_SAME(a - s, -b);
+ ASSERT_SAME(s - a, b);
+}
+
+TEST(LlvmLibcUIntClassTest, SignedMulDiv) {
+ // Computations performed by https://www.wolframalpha.com/
+ using T = BigInt<128, true, uint16_t>;
+ struct {
+ const char *a;
+ const char *b;
+ const char *mul;
+ } const test_cases[] = {{"-4", "3", "-12"},
+ {"-3", "-3", "9"},
+ {"1927508279017230597", "278789278723478925",
+ "537368642840747885329125014794668225"}};
+ for (auto tc : test_cases) {
+ const T a = parse_bigint<T>(tc.a);
+ const T b = parse_bigint<T>(tc.b);
+ const T mul = parse_bigint<T>(tc.mul);
+ // Multiplication
+ ASSERT_SAME(a * b, mul);
+ ASSERT_SAME(b * a, mul); // commutative
+ ASSERT_SAME(a * -b, -mul); // sign
+ ASSERT_SAME(-a * b, -mul); // sign
+ ASSERT_SAME(-a * -b, mul); // sign
+ // Division
+ ASSERT_SAME(mul / a, b);
+ ASSERT_SAME(mul / b, a);
+ ASSERT_SAME(-mul / a, -b); // sign
+ ASSERT_SAME(mul / -a, -b); // sign
+ ASSERT_SAME(-mul / -a, b); // sign
+ }
+}
+
+TYPED_TEST(LlvmLibcUIntClassTest, Division, Types) {
+ ASSERT_SAME(create<T>(ZERO) / create<T>(ONE), create<T>(ZERO));
+ ASSERT_SAME(create<T>(MAX) / create<T>(ONE), create<T>(MAX));
+ ASSERT_SAME(create<T>(MAX) / create<T>(MAX), create<T>(ONE));
+ ASSERT_SAME(create<T>(ONE) / create<T>(ONE), create<T>(ONE));
+ if constexpr (T::SIGNED) {
+ // Special case found by fuzzing.
+ ASSERT_SAME(create<T>(MIN) / create<T>(MIN), create<T>(ONE));
+ }
+ // - unsigned : 0xff / 0x02 = 0x7f
+ // - signed : 0xef / 0x02 = 0x77
+ ASSERT_SAME(create<T>(MAX) / create<T>(TWO), (create<T>(MAX) >> 1));
+
+ using word_type = typename T::word_type;
+ const T zero_one_repeated = T::all_ones() / T(0xff);
+ const word_type pattern = word_type(~0) / word_type(0xff);
+ for (const word_type part : zero_one_repeated.val) {
+ if constexpr (T::SIGNED == false) {
+ EXPECT_EQ(part, pattern);
+ }
+ }
+}
+
+TYPED_TEST(LlvmLibcUIntClassTest, is_neg, Types) {
+ EXPECT_FALSE(create<T>(ZERO).is_neg());
+ EXPECT_FALSE(create<T>(ONE).is_neg());
+ EXPECT_FALSE(create<T>(TWO).is_neg());
+ EXPECT_EQ(create<T>(MIN).is_neg(), T::SIGNED);
+ EXPECT_FALSE(create<T>(MAX).is_neg());
+}
+
+TYPED_TEST(LlvmLibcUIntClassTest, Masks, Types) {
+ if constexpr (!T::SIGNED) {
+ constexpr size_t BITS = T::BITS;
+ // mask_trailing_ones
+ ASSERT_SAME((mask_trailing_ones<T, 0>()), T::zero());
+ ASSERT_SAME((mask_trailing_ones<T, 1>()), T::one());
+ ASSERT_SAME((mask_trailing_ones<T, BITS - 1>()), T::all_ones() >> 1);
+ ASSERT_SAME((mask_trailing_ones<T, BITS>()), T::all_ones());
+ // mask_leading_ones
+ ASSERT_SAME((mask_leading_ones<T, 0>()), T::zero());
+ ASSERT_SAME((mask_leading_ones<T, 1>()), T::one() << (BITS - 1));
+ ASSERT_SAME((mask_leading_ones<T, BITS - 1>()), T::all_ones() - T::one());
+ ASSERT_SAME((mask_leading_ones<T, BITS>()), T::all_ones());
+ // mask_trailing_zeros
+ ASSERT_SAME((mask_trailing_zeros<T, 0>()), T::all_ones());
+ ASSERT_SAME((mask_trailing_zeros<T, 1>()), T::all_ones() - T::one());
+ ASSERT_SAME((mask_trailing_zeros<T, BITS - 1>()), T::one() << (BITS - 1));
+ ASSERT_SAME((mask_trailing_zeros<T, BITS>()), T::zero());
+ // mask_leading_zeros
+ ASSERT_SAME((mask_leading_zeros<T, 0>()), T::all_ones());
+ ASSERT_SAME((mask_leading_zeros<T, 1>()), T::all_ones() >> 1);
+ ASSERT_SAME((mask_leading_zeros<T, BITS - 1>()), T::one());
+ ASSERT_SAME((mask_leading_zeros<T, BITS>()), T::zero());
+ }
+}
+
+TYPED_TEST(LlvmLibcUIntClassTest, CountBits, Types) {
+ if constexpr (!T::SIGNED) {
+ for (size_t i = 0; i <= T::BITS; ++i) {
+ const auto l_one = T::all_ones() << i; // 0b111...000
+ const auto r_one = T::all_ones() >> i; // 0b000...111
+ const int zeros = i;
+ const int ones = T::BITS - zeros;
+ ASSERT_EQ(cpp::countr_one(r_one), ones);
+ ASSERT_EQ(cpp::countl_one(l_one), ones);
+ ASSERT_EQ(cpp::countr_zero(l_one), zeros);
+ ASSERT_EQ(cpp::countl_zero(r_one), zeros);
+ }
+ }
+}
+
using LL_UInt64 = UInt<64>;
// We want to test UInt<128> explicitly. So, for
// convenience, we use a sugar which does not conflict with the UInt128 type
@@ -561,7 +751,7 @@ TEST(LlvmLibcUIntClassTest, FullMulTests) {
LL_UInt##Bits a = ~LL_UInt##Bits(0); \
LL_UInt##Bits hi = a.quick_mul_hi(a); \
LL_UInt##Bits trunc = static_cast<LL_UInt##Bits>(a.ful_mul(a) >> Bits); \
- uint64_t overflow = trunc.sub(hi); \
+ uint64_t overflow = trunc.sub_overflow(hi); \
EXPECT_EQ(overflow, uint64_t(0)); \
EXPECT_LE(uint64_t(trunc), uint64_t(Error)); \
} while (0)
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index f8f0f8ba7b6f..bbf8f071e1e0 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -638,6 +638,21 @@ add_fp_unittest(
)
add_fp_unittest(
+ exp2m1f_test
+ NEED_MPFR
+ SUITE
+ libc-math-unittests
+ SRCS
+ exp2m1f_test.cpp
+ DEPENDS
+ libc.include.llvm-libc-macros.math_macros
+ libc.src.errno.errno
+ libc.src.math.exp2m1f
+ libc.src.__support.CPP.array
+ libc.src.__support.FPUtil.fp_bits
+)
+
+add_fp_unittest(
exp10f_test
NEED_MPFR
SUITE
diff --git a/libc/test/src/math/exhaustive/CMakeLists.txt b/libc/test/src/math/exhaustive/CMakeLists.txt
index df32dd4f943f..6b2f3dddcadd 100644
--- a/libc/test/src/math/exhaustive/CMakeLists.txt
+++ b/libc/test/src/math/exhaustive/CMakeLists.txt
@@ -143,6 +143,21 @@ add_fp_unittest(
)
add_fp_unittest(
+ exp2m1f_test
+ NO_RUN_POSTBUILD
+ NEED_MPFR
+ SUITE
+ libc_math_exhaustive_tests
+ SRCS
+ exp2m1f_test.cpp
+ DEPENDS
+ .exhaustive_test
+ libc.src.math.exp2m1f
+ LINK_LIBRARIES
+ -lpthread
+)
+
+add_fp_unittest(
exp10f_test
NO_RUN_POSTBUILD
NEED_MPFR
diff --git a/libc/test/src/math/exhaustive/exp2m1f_test.cpp b/libc/test/src/math/exhaustive/exp2m1f_test.cpp
new file mode 100644
index 000000000000..2111024cb5c0
--- /dev/null
+++ b/libc/test/src/math/exhaustive/exp2m1f_test.cpp
@@ -0,0 +1,33 @@
+//===-- Exhaustive test for exp2m1f ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "exhaustive_test.h"
+#include "src/math/exp2m1f.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+using LlvmLibcExp2m1fExhaustiveTest =
+ LlvmLibcUnaryOpExhaustiveMathTest<float, mpfr::Operation::Exp2m1,
+ LIBC_NAMESPACE::exp2m1f>;
+
+// Range: [0, Inf];
+static constexpr uint32_t POS_START = 0x0000'0000U;
+static constexpr uint32_t POS_STOP = 0x7f80'0000U;
+
+TEST_F(LlvmLibcExp2m1fExhaustiveTest, PositiveRange) {
+ test_full_range_all_roundings(POS_START, POS_STOP);
+}
+
+// Range: [-Inf, 0];
+static constexpr uint32_t NEG_START = 0x8000'0000U;
+static constexpr uint32_t NEG_STOP = 0xff80'0000U;
+
+TEST_F(LlvmLibcExp2m1fExhaustiveTest, NegativeRange) {
+ test_full_range_all_roundings(NEG_START, NEG_STOP);
+}
diff --git a/libc/test/src/math/exp2m1f_test.cpp b/libc/test/src/math/exp2m1f_test.cpp
new file mode 100644
index 000000000000..a0f0da868117
--- /dev/null
+++ b/libc/test/src/math/exp2m1f_test.cpp
@@ -0,0 +1,66 @@
+//===-- Unittests for exp2m1f ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "include/llvm-libc-macros/math-macros.h"
+#include "src/__support/CPP/array.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/exp2m1f.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+#include <stdint.h>
+
+using LlvmLibcExp2m1fTest = LIBC_NAMESPACE::testing::FPTest<float>;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+TEST_F(LlvmLibcExp2m1fTest, TrickyInputs) {
+ constexpr LIBC_NAMESPACE::cpp::array<float, 10> INPUTS = {
+ // EXP2M1F_EXCEPTS_LO
+ 0x1.36dc8ep-36,
+ 0x1.224936p-19,
+ 0x1.d16d2p-20,
+ 0x1.17949ep-14,
+ -0x1.9c3e1ep-38,
+ -0x1.4d89b4p-32,
+ -0x1.a6eac4p-10,
+ -0x1.e7526ep-6,
+ // EXP2M1F_EXCEPTS_HI
+ 0x1.16a972p-1,
+ -0x1.9f12acp-5,
+ };
+
+ for (float x : INPUTS) {
+ LIBC_NAMESPACE::libc_errno = 0;
+ EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x,
+ LIBC_NAMESPACE::exp2m1f(x), 0.5);
+ }
+}
+
+TEST_F(LlvmLibcExp2m1fTest, InFloatRange) {
+ constexpr uint32_t COUNT = 100'000;
+ constexpr uint32_t STEP = UINT32_MAX / COUNT;
+ for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
+ float x = FPBits(v).get_val();
+ if (isnan(x) || isinf(x))
+ continue;
+ LIBC_NAMESPACE::libc_errno = 0;
+ float result = LIBC_NAMESPACE::exp2m1f(x);
+
+ // If the computation resulted in an error or did not produce valid result
+ // in the single-precision floating point range, then ignore comparing with
+ // MPFR result as MPFR can still produce valid results because of its
+ // wider precision.
+ if (isnan(result) || isinf(result) || LIBC_NAMESPACE::libc_errno != 0)
+ continue;
+ ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x,
+ LIBC_NAMESPACE::exp2m1f(x), 0.5);
+ }
+}
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 5d269ddb229c..4ac1842cf5fa 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -774,6 +774,17 @@ add_fp_unittest(
)
add_fp_unittest(
+ exp2m1f_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ exp2m1f_test.cpp
+ DEPENDS
+ libc.src.errno.errno
+ libc.src.math.exp2m1f
+)
+
+add_fp_unittest(
exp10f_test
SUITE
libc-math-smoke-tests
diff --git a/libc/test/src/math/smoke/exp2m1f_test.cpp b/libc/test/src/math/smoke/exp2m1f_test.cpp
new file mode 100644
index 000000000000..2df435385247
--- /dev/null
+++ b/libc/test/src/math/smoke/exp2m1f_test.cpp
@@ -0,0 +1,63 @@
+//===-- Unittests for exp2m1f ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/errno/libc_errno.h"
+#include "src/math/exp2m1f.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcExp2m1fTest = LIBC_NAMESPACE::testing::FPTest<float>;
+using LIBC_NAMESPACE::fputil::testing::ForceRoundingMode;
+using LIBC_NAMESPACE::fputil::testing::RoundingMode;
+
+TEST_F(LlvmLibcExp2m1fTest, SpecialNumbers) {
+ LIBC_NAMESPACE::libc_errno = 0;
+
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp2m1f(aNaN));
+ EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::exp2m1f(inf));
+ EXPECT_FP_EQ_ALL_ROUNDING(-1.0f, LIBC_NAMESPACE::exp2m1f(neg_inf));
+ EXPECT_FP_EQ_ALL_ROUNDING(0.0f, LIBC_NAMESPACE::exp2m1f(0.0f));
+ EXPECT_FP_EQ_ALL_ROUNDING(-0.0f, LIBC_NAMESPACE::exp2m1f(-0.0f));
+
+ EXPECT_FP_EQ_ALL_ROUNDING(1.0f, LIBC_NAMESPACE::exp2m1f(1.0f));
+ EXPECT_FP_EQ_ALL_ROUNDING(-0.5f, LIBC_NAMESPACE::exp2m1f(-1.0f));
+ EXPECT_FP_EQ_ALL_ROUNDING(3.0f, LIBC_NAMESPACE::exp2m1f(2.0f));
+ EXPECT_FP_EQ_ALL_ROUNDING(-0.75f, LIBC_NAMESPACE::exp2m1f(-2.0f));
+}
+
+TEST_F(LlvmLibcExp2m1fTest, Overflow) {
+ LIBC_NAMESPACE::libc_errno = 0;
+
+ EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2m1f(0x1.fffffep+127),
+ FE_OVERFLOW);
+ EXPECT_MATH_ERRNO(ERANGE);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2m1f(128.0f),
+ FE_OVERFLOW);
+ EXPECT_MATH_ERRNO(ERANGE);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2m1f(0x1.000002p+7),
+ FE_OVERFLOW);
+ EXPECT_MATH_ERRNO(ERANGE);
+}
+
+TEST_F(LlvmLibcExp2m1fTest, Underflow) {
+ LIBC_NAMESPACE::libc_errno = 0;
+
+ EXPECT_FP_EQ_WITH_EXCEPTION(-1.0f, LIBC_NAMESPACE::exp2m1f(-0x1.fffffep+127),
+ FE_UNDERFLOW);
+ EXPECT_MATH_ERRNO(ERANGE);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION(-1.0f, LIBC_NAMESPACE::exp2m1f(-25.0f),
+ FE_UNDERFLOW);
+ EXPECT_MATH_ERRNO(ERANGE);
+
+ EXPECT_FP_EQ_WITH_EXCEPTION(-1.0f, LIBC_NAMESPACE::exp2m1f(-0x1.900002p4),
+ FE_UNDERFLOW);
+ EXPECT_MATH_ERRNO(ERANGE);
+}
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index a83f7a7ceb92..eaa47da6bda2 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -89,7 +89,8 @@ public:
// precision.
template <typename XType,
cpp::enable_if_t<cpp::is_same_v<float, XType>, int> = 0>
- explicit MPFRNumber(XType x, int precision = ExtraPrecision<XType>::VALUE,
+ explicit MPFRNumber(XType x,
+ unsigned int precision = ExtraPrecision<XType>::VALUE,
RoundingMode rounding = RoundingMode::Nearest)
: mpfr_precision(precision),
mpfr_rounding(get_mpfr_rounding_mode(rounding)) {
@@ -99,7 +100,8 @@ public:
template <typename XType,
cpp::enable_if_t<cpp::is_same_v<double, XType>, int> = 0>
- explicit MPFRNumber(XType x, int precision = ExtraPrecision<XType>::VALUE,
+ explicit MPFRNumber(XType x,
+ unsigned int precision = ExtraPrecision<XType>::VALUE,
RoundingMode rounding = RoundingMode::Nearest)
: mpfr_precision(precision),
mpfr_rounding(get_mpfr_rounding_mode(rounding)) {
@@ -109,7 +111,8 @@ public:
template <typename XType,
cpp::enable_if_t<cpp::is_same_v<long double, XType>, int> = 0>
- explicit MPFRNumber(XType x, int precision = ExtraPrecision<XType>::VALUE,
+ explicit MPFRNumber(XType x,
+ unsigned int precision = ExtraPrecision<XType>::VALUE,
RoundingMode rounding = RoundingMode::Nearest)
: mpfr_precision(precision),
mpfr_rounding(get_mpfr_rounding_mode(rounding)) {
@@ -119,7 +122,8 @@ public:
template <typename XType,
cpp::enable_if_t<cpp::is_integral_v<XType>, int> = 0>
- explicit MPFRNumber(XType x, int precision = ExtraPrecision<float>::VALUE,
+ explicit MPFRNumber(XType x,
+ unsigned int precision = ExtraPrecision<float>::VALUE,
RoundingMode rounding = RoundingMode::Nearest)
: mpfr_precision(precision),
mpfr_rounding(get_mpfr_rounding_mode(rounding)) {
@@ -134,6 +138,12 @@ public:
mpfr_set(value, other.value, mpfr_rounding);
}
+ MPFRNumber(const MPFRNumber &other, unsigned int precision)
+ : mpfr_precision(precision), mpfr_rounding(other.mpfr_rounding) {
+ mpfr_init2(value, mpfr_precision);
+ mpfr_set(value, other.value, mpfr_rounding);
+ }
+
~MPFRNumber() { mpfr_clear(value); }
MPFRNumber &operator=(const MPFRNumber &rhs) {
@@ -229,6 +239,36 @@ public:
return result;
}
+ MPFRNumber exp2m1() const {
+ // TODO: Only use mpfr_exp2m1 once CI and buildbots get MPFR >= 4.2.0.
+#if MPFR_VERSION_MAJOR > 4 || \
+ (MPFR_VERSION_MAJOR == 4 && MPFR_VERSION_MINOR >= 2)
+ MPFRNumber result(*this);
+ mpfr_exp2m1(result.value, value, mpfr_rounding);
+ return result;
+#else
+ unsigned int prec = mpfr_precision * 3;
+ MPFRNumber result(*this, prec);
+
+ float f = mpfr_get_flt(abs().value, mpfr_rounding);
+ if (f > 0.5f && f < 0x1.0p30f) {
+ mpfr_exp2(result.value, value, mpfr_rounding);
+ mpfr_sub_ui(result.value, result.value, 1, mpfr_rounding);
+ return result;
+ }
+
+ MPFRNumber ln2(2.0f, prec);
+ // log(2)
+ mpfr_log(ln2.value, ln2.value, mpfr_rounding);
+ // x * log(2)
+ mpfr_mul(result.value, value, ln2.value, mpfr_rounding);
+ // e^(x * log(2)) - 1
+ int ex = mpfr_expm1(result.value, result.value, mpfr_rounding);
+ mpfr_subnormalize(result.value, ex, mpfr_rounding);
+ return result;
+#endif
+ }
+
MPFRNumber exp10() const {
MPFRNumber result(*this);
mpfr_exp10(result.value, value, mpfr_rounding);
@@ -570,6 +610,8 @@ unary_operation(Operation op, InputType input, unsigned int precision,
return mpfrInput.exp();
case Operation::Exp2:
return mpfrInput.exp2();
+ case Operation::Exp2m1:
+ return mpfrInput.exp2m1();
case Operation::Exp10:
return mpfrInput.exp10();
case Operation::Expm1:
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h
index d5ff590cd7bb..0a41ac639798 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.h
+++ b/libc/utils/MPFRWrapper/MPFRUtils.h
@@ -37,6 +37,7 @@ enum class Operation : int {
Erf,
Exp,
Exp2,
+ Exp2m1,
Exp10,
Expm1,
Floor,
diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 9236f09d3667..21e5cac68822 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -20,22 +20,7 @@ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
spirv64/lib/SOURCES
)
-# List of all targets
-set( LIBCLC_TARGETS_ALL
- amdgcn--
- amdgcn--amdhsa
- clspv--
- clspv64--
- r600--
- nvptx--
- nvptx64--
- nvptx--nvidiacl
- nvptx64--nvidiacl
- spirv-mesa3d-
- spirv64-mesa3d-
-)
-
-set( LIBCLC_MIN_LLVM "3.9.0" )
+set( LIBCLC_MIN_LLVM 3.9.0 )
set( LIBCLC_TARGETS_TO_BUILD "all"
CACHE STRING "Semicolon-separated list of targets to build, or 'all'." )
@@ -47,19 +32,10 @@ include(AddLLVM)
message( STATUS "libclc LLVM version: ${LLVM_PACKAGE_VERSION}" )
-if( ${LLVM_PACKAGE_VERSION} VERSION_LESS ${LIBCLC_MIN_LLVM} )
+if( LLVM_PACKAGE_VERSION VERSION_LESS LIBCLC_MIN_LLVM )
message( FATAL_ERROR "libclc needs at least LLVM ${LIBCLC_MIN_LLVM}" )
endif()
-# mesa3d environment is only available since LLVM 4.0
-if( ${LLVM_PACKAGE_VERSION} VERSION_GREATER "3.9.0" )
- set( LIBCLC_TARGETS_ALL ${LIBCLC_TARGETS_ALL} amdgcn-mesa-mesa3d )
-endif()
-
-if( LIBCLC_TARGETS_TO_BUILD STREQUAL "all" )
- set( LIBCLC_TARGETS_TO_BUILD ${LIBCLC_TARGETS_ALL} )
-endif()
-
find_program( LLVM_CLANG clang PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH )
find_program( LLVM_AS llvm-as PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH )
find_program( LLVM_LINK llvm-link PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH )
@@ -76,15 +52,45 @@ if( NOT LLVM_CLANG OR NOT LLVM_OPT OR NOT LLVM_AS OR NOT LLVM_LINK )
message( FATAL_ERROR "libclc toolchain incomplete!" )
endif()
+# List of all targets. Note that some are added dynamically below.
+set( LIBCLC_TARGETS_ALL
+ amdgcn--
+ amdgcn--amdhsa
+ clspv--
+ clspv64--
+ r600--
+ nvptx--
+ nvptx64--
+ nvptx--nvidiacl
+ nvptx64--nvidiacl
+)
+
+# mesa3d environment is only available since LLVM 4.0
+if( LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL 4.0.0 )
+ list( APPEND LIBCLC_TARGETS_ALL amdgcn-mesa-mesa3d )
+endif()
+
+# spirv-mesa3d and spirv64-mesa3d targets can only be built with the (optional)
+# llvm-spirv external tool.
+if( LLVM_SPIRV )
+ list( APPEND LIBCLC_TARGETS_ALL spirv-mesa3d- spirv64-mesa3d- )
+endif()
+
+if( LIBCLC_TARGETS_TO_BUILD STREQUAL "all" )
+ set( LIBCLC_TARGETS_TO_BUILD ${LIBCLC_TARGETS_ALL} )
+endif()
+
list( SORT LIBCLC_TARGETS_TO_BUILD )
+# Verify that the user hasn't requested mesa3d targets without an available
+# llvm-spirv tool.
if( "spirv-mesa3d-" IN_LIST LIBCLC_TARGETS_TO_BUILD OR "spirv64-mesa3d-" IN_LIST LIBCLC_TARGETS_TO_BUILD )
if( NOT LLVM_SPIRV )
message( FATAL_ERROR "SPIR-V targets requested, but spirv-tools is not installed" )
endif()
endif()
-set( CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake )
+set( CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake )
set( CMAKE_CLC_COMPILER ${LLVM_CLANG} )
set( CMAKE_CLC_ARCHIVE ${LLVM_LINK} )
set( CMAKE_LLAsm_PREPROCESSOR ${LLVM_CLANG} )
@@ -96,7 +102,7 @@ set( LLVM_VERSION_DEFINE "-DHAVE_LLVM=0x${LLVM_VERSION_MAJOR}0${LLVM_VERSION_MIN
# LLVM 13 enables standard includes by default
-if( ${LLVM_PACKAGE_VERSION} VERSION_GREATER "12.99.99" )
+if( LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL 13.0.0 )
set( CMAKE_LLAsm_FLAGS "${CMAKE_LLAsm_FLAGS} -cl-no-stdinc" )
set( CMAKE_CLC_FLAGS "${CMAKE_CLC_FLAGS} -cl-no-stdinc" )
endif()
@@ -113,9 +119,10 @@ set(LLVM_LINK_COMPONENTS
BitReader
BitWriter
Core
+ IRReader
Support
)
-add_llvm_executable( prepare_builtins utils/prepare-builtins.cpp )
+add_llvm_utility( prepare_builtins utils/prepare-builtins.cpp )
target_compile_definitions( prepare_builtins PRIVATE ${LLVM_VERSION_DEFINE} )
# These were not properly reported in early LLVM and we don't need them
target_compile_options( prepare_builtins PRIVATE -fno-rtti -fno-exceptions )
@@ -165,7 +172,7 @@ if( ENABLE_RUNTIME_SUBNORMAL )
endif()
find_package( Python3 REQUIRED COMPONENTS Interpreter )
-file( TO_CMAKE_PATH ${CMAKE_SOURCE_DIR}/generic/lib/gen_convert.py script_loc )
+file( TO_CMAKE_PATH ${PROJECT_SOURCE_DIR}/generic/lib/gen_convert.py script_loc )
add_custom_command(
OUTPUT convert.cl
COMMAND ${Python3_EXECUTABLE} ${script_loc} > convert.cl
@@ -198,7 +205,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
list( APPEND dirs amdgpu )
endif()
- #nvptx is special
+ # nvptx is special
if( ${ARCH} STREQUAL nvptx OR ${ARCH} STREQUAL nvptx64 )
set( DARCH ptx )
else()
@@ -210,7 +217,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
foreach( l ${dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} )
foreach( s "SOURCES" "SOURCES_${LLVM_MAJOR}.${LLVM_MINOR}" )
file( TO_CMAKE_PATH ${l}/lib/${s} file_loc )
- file( TO_CMAKE_PATH ${CMAKE_SOURCE_DIR}/${file_loc} loc )
+ file( TO_CMAKE_PATH ${PROJECT_SOURCE_DIR}/${file_loc} loc )
# Prepend the location to give higher priority to
# specialized implementation
if( EXISTS ${loc} )
@@ -219,8 +226,8 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
endforeach()
endforeach()
- # Add the generated convert.cl here to prevent adding
- # the one listed in SOURCES
+ # Add the generated convert.cl here to prevent adding the one listed in
+ # SOURCES
if( NOT ${ARCH} STREQUAL "spirv" AND NOT ${ARCH} STREQUAL "spirv64" )
if( NOT ENABLE_RUNTIME_SUBNORMAL AND NOT ${ARCH} STREQUAL "clspv" AND
NOT ${ARCH} STREQUAL "clspv64" )
@@ -246,7 +253,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
list( APPEND objects ${f} )
list( APPEND rel_files ${dir}/${f} )
# FIXME: This should really go away
- file( TO_CMAKE_PATH ${CMAKE_SOURCE_DIR}/${dir}/${f} src_loc )
+ file( TO_CMAKE_PATH ${PROJECT_SOURCE_DIR}/${dir}/${f} src_loc )
get_filename_component( fdir ${src_loc} DIRECTORY )
set_source_files_properties( ${dir}/${f}
@@ -288,53 +295,56 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
set( opt_flags -O3 )
endif()
- add_library( builtins.link.${arch_suffix} STATIC ${rel_files} )
+ set( builtins_link_lib_tgt builtins.link.${arch_suffix} )
+ add_library( ${builtins_link_lib_tgt} STATIC ${rel_files} )
# Make sure we depend on the pseudo target to prevent
# multiple invocations
- add_dependencies( builtins.link.${arch_suffix} generate_convert.cl )
- add_dependencies( builtins.link.${arch_suffix} clspv-generate_convert.cl )
+ add_dependencies( ${builtins_link_lib_tgt} generate_convert.cl )
+ add_dependencies( ${builtins_link_lib_tgt} clspv-generate_convert.cl )
# CMake will turn this include into absolute path
- target_include_directories( builtins.link.${arch_suffix} PRIVATE
+ target_include_directories( ${builtins_link_lib_tgt} PRIVATE
"generic/include" )
- target_compile_definitions( builtins.link.${arch_suffix} PRIVATE
+ target_compile_definitions( ${builtins_link_lib_tgt} PRIVATE
"__CLC_INTERNAL" )
string( TOUPPER "-DCLC_${ARCH}" CLC_TARGET_DEFINE )
- target_compile_definitions( builtins.link.${arch_suffix} PRIVATE
+ target_compile_definitions( ${builtins_link_lib_tgt} PRIVATE
${CLC_TARGET_DEFINE} )
- target_compile_options( builtins.link.${arch_suffix} PRIVATE -target
+ target_compile_options( ${builtins_link_lib_tgt} PRIVATE -target
${t} ${mcpu} -fno-builtin -nostdlib ${build_flags} )
- set_target_properties( builtins.link.${arch_suffix} PROPERTIES
+ set_target_properties( ${builtins_link_lib_tgt} PROPERTIES
LINKER_LANGUAGE CLC )
set( obj_suffix ${arch_suffix}.bc )
+ set( builtins_opt_lib_tgt builtins.opt.${obj_suffix} )
# Add opt target
- add_custom_command( OUTPUT "builtins.opt.${obj_suffix}"
- COMMAND ${LLVM_OPT} ${opt_flags} -o "builtins.opt.${obj_suffix}" "builtins.link.${obj_suffix}"
- DEPENDS "builtins.link.${arch_suffix}" )
+ add_custom_command( OUTPUT ${builtins_opt_lib_tgt}
+ COMMAND ${LLVM_OPT} ${opt_flags} -o ${builtins_opt_lib_tgt}
+ $<TARGET_FILE:${builtins_link_lib_tgt}>
+ DEPENDS ${builtins_link_lib_tgt} )
add_custom_target( "opt.${obj_suffix}" ALL
- DEPENDS "builtins.opt.${obj_suffix}" )
+ DEPENDS ${builtins_opt_lib_tgt} )
if( ${ARCH} STREQUAL "spirv" OR ${ARCH} STREQUAL "spirv64" )
set( spv_suffix ${arch_suffix}.spv )
add_custom_command( OUTPUT "${spv_suffix}"
- COMMAND ${LLVM_SPIRV} ${spvflags} -o "${spv_suffix}" "builtins.link.${obj_suffix}"
- DEPENDS "builtins.link.${arch_suffix}" )
+ COMMAND ${LLVM_SPIRV} ${spvflags} -o "${spv_suffix}" ${builtins_opt_lib_tgt}
+ DEPENDS ${builtins_link_lib_tgt} )
add_custom_target( "prepare-${spv_suffix}" ALL DEPENDS "${spv_suffix}" )
install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${spv_suffix}
DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
else()
# Add prepare target
add_custom_command( OUTPUT "${obj_suffix}"
- COMMAND prepare_builtins -o "${obj_suffix}" "builtins.opt.${obj_suffix}"
- DEPENDS "opt.${obj_suffix}" "builtins.opt.${obj_suffix}" prepare_builtins )
+ COMMAND prepare_builtins -o "${obj_suffix}" ${builtins_opt_lib_tgt}
+ DEPENDS "opt.${obj_suffix}" ${builtins_opt_lib_tgt} prepare_builtins )
add_custom_target( "prepare-${obj_suffix}" ALL DEPENDS "${obj_suffix}" )
# nvptx-- targets don't include workitem builtins
if( NOT ${t} MATCHES ".*ptx.*--$" )
add_test( NAME external-calls-${obj_suffix}
COMMAND ./check_external_calls.sh ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} ${LLVM_TOOLS_BINARY_DIR}
- WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} )
+ WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} )
endif()
install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index dd39c1bbbc78..2da9df54a531 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -42,6 +42,7 @@ Implemented Papers
- P2652R2 - Disallow User Specialization of ``allocator_traits``
- P2819R2 - Add ``tuple`` protocol to ``complex``
- P2495R3 - Interfacing ``stringstream``\s with ``string_view``
+- P2867R2 - Remove Deprecated ``strstream``\s From C++26
- P2302R4 - ``std::ranges::contains``
- P1659R3 - ``std::ranges::starts_with`` and ``std::ranges::ends_with``
@@ -54,6 +55,8 @@ Improvements and New Features
- The ``std::mismatch`` algorithm has been optimized for integral types, which can lead up to 40x performance
improvements.
+- The ``_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM`` macro has been added to make the declarations in ``<strstream>`` available.
+
Deprecations and Removals
-------------------------
diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv
index 6e82086d5469..a34dad581680 100644
--- a/libcxx/docs/Status/Cxx2cPapers.csv
+++ b/libcxx/docs/Status/Cxx2cPapers.csv
@@ -47,7 +47,7 @@
"`P1673R13 <https://wg21.link/P1673R13>`__","LWG","A free function linear algebra interface based on the BLAS","Kona November 2023","","",""
"","","","","","",""
"`P2875R4 <https://wg21.link/P2875R4>`__","LWG","Undeprecate ``polymorphic_allocator::destroy`` for C++26","Tokyo March 2024","|Complete|","15.0",""
-"`P2867R2 <https://wg21.link/P2867R2>`__","LWG","Remove Deprecated ``strstreams`` From C++26","Tokyo March 2024","","",""
+"`P2867R2 <https://wg21.link/P2867R2>`__","LWG","Remove Deprecated ``strstreams`` From C++26","Tokyo March 2024","|Complete|","19.0",""
"`P2869R4 <https://wg21.link/P2869R4>`__","LWG","Remove Deprecated ``shared_ptr`` Atomic Access APIs from C++26","Tokyo March 2024","","",""
"`P2872R3 <https://wg21.link/P2872R3>`__","LWG","Remove ``wstring_convert`` From C++26","Tokyo March 2024","","",""
"`P3107R5 <https://wg21.link/P3107R5>`__","LWG","Permit an efficient implementation of ``std::print``","Tokyo March 2024","","","|format| |DR|"
diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index ac12b0b96950..8a1c747a2541 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -272,7 +272,10 @@ C++26 Specific Configuration Macros
``std::basic_string<...>::reserve()``.
**_LIBCPP_ENABLE_CXX26_REMOVED_ALLOCATOR_MEMBERS**:
- This macro is used to re-enable redundant member of ``allocator<T>::is_always_equal``
+ This macro is used to re-enable the redundant member ``allocator<T>::is_always_equal``.
+
+**_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM**:
+ This macro is used to re-enable all named declarations in ``<strstream>``.
Libc++ Extensions
=================
diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst
index db55c6f02a3d..743f99297d17 100644
--- a/libcxx/docs/index.rst
+++ b/libcxx/docs/index.rst
@@ -134,7 +134,7 @@ velocity, libc++ drops support for older compilers as newer ones are released.
============ =============== ========================== =====================
Compiler Versions Restrictions Support policy
============ =============== ========================== =====================
-Clang 16, 17, 18-git latest two stable releases per `LLVM's release page <https://releases.llvm.org>`_ and the development version
+Clang 17, 18, 19-git latest two stable releases per `LLVM's release page <https://releases.llvm.org>`_ and the development version
AppleClang 15 latest stable release per `Xcode's release page <https://developer.apple.com/documentation/xcode-release-notes>`_
Open XL 17.1 (AIX) latest stable release per `Open XL's documentation page <https://www.ibm.com/docs/en/openxl-c-and-cpp-aix>`_
GCC 13 In C++11 or later only latest stable release per `GCC's release page <https://gcc.gnu.org/releases.html>`_
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index db3980342f50..097a41d4c417 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -738,6 +738,7 @@ set(files
__type_traits/datasizeof.h
__type_traits/decay.h
__type_traits/dependent_type.h
+ __type_traits/desugars_to.h
__type_traits/disjunction.h
__type_traits/enable_if.h
__type_traits/extent.h
@@ -822,7 +823,6 @@ set(files
__type_traits/nat.h
__type_traits/negation.h
__type_traits/noexcept_move_assign_container.h
- __type_traits/operation_traits.h
__type_traits/promote.h
__type_traits/rank.h
__type_traits/remove_all_extents.h
diff --git a/libcxx/include/__algorithm/comp.h b/libcxx/include/__algorithm/comp.h
index 3902f7560304..a089375e3da1 100644
--- a/libcxx/include/__algorithm/comp.h
+++ b/libcxx/include/__algorithm/comp.h
@@ -10,8 +10,7 @@
#define _LIBCPP___ALGORITHM_COMP_H
#include <__config>
-#include <__type_traits/integral_constant.h>
-#include <__type_traits/operation_traits.h>
+#include <__type_traits/desugars_to.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
@@ -27,7 +26,7 @@ struct __equal_to {
};
template <class _Tp, class _Up>
-struct __desugars_to<__equal_tag, __equal_to, _Tp, _Up> : true_type {};
+inline const bool __desugars_to_v<__equal_tag, __equal_to, _Tp, _Up> = true;
// The definition is required because __less is part of the ABI, but it's empty
// because all comparisons should be transparent.
diff --git a/libcxx/include/__algorithm/equal.h b/libcxx/include/__algorithm/equal.h
index c76a16b47f5d..1341d9e4159b 100644
--- a/libcxx/include/__algorithm/equal.h
+++ b/libcxx/include/__algorithm/equal.h
@@ -18,12 +18,11 @@
#include <__iterator/distance.h>
#include <__iterator/iterator_traits.h>
#include <__string/constexpr_c_functions.h>
+#include <__type_traits/desugars_to.h>
#include <__type_traits/enable_if.h>
-#include <__type_traits/integral_constant.h>
#include <__type_traits/is_constant_evaluated.h>
#include <__type_traits/is_equality_comparable.h>
#include <__type_traits/is_volatile.h>
-#include <__type_traits/operation_traits.h>
#include <__utility/move.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -47,7 +46,7 @@ _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 boo
template <class _Tp,
class _Up,
class _BinaryPredicate,
- __enable_if_t<__desugars_to<__equal_tag, _BinaryPredicate, _Tp, _Up>::value && !is_volatile<_Tp>::value &&
+ __enable_if_t<__desugars_to_v<__equal_tag, _BinaryPredicate, _Tp, _Up> && !is_volatile<_Tp>::value &&
!is_volatile<_Up>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value,
int> = 0>
_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
@@ -87,7 +86,7 @@ template <class _Tp,
class _Pred,
class _Proj1,
class _Proj2,
- __enable_if_t<__desugars_to<__equal_tag, _Pred, _Tp, _Up>::value && __is_identity<_Proj1>::value &&
+ __enable_if_t<__desugars_to_v<__equal_tag, _Pred, _Tp, _Up> && __is_identity<_Proj1>::value &&
__is_identity<_Proj2>::value && !is_volatile<_Tp>::value && !is_volatile<_Up>::value &&
__libcpp_is_trivially_equality_comparable<_Tp, _Up>::value,
int> = 0>
diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h
index 8abb273ac178..4ada29eabc47 100644
--- a/libcxx/include/__algorithm/mismatch.h
+++ b/libcxx/include/__algorithm/mismatch.h
@@ -16,11 +16,11 @@
#include <__algorithm/unwrap_iter.h>
#include <__config>
#include <__functional/identity.h>
+#include <__type_traits/desugars_to.h>
#include <__type_traits/invoke.h>
#include <__type_traits/is_constant_evaluated.h>
#include <__type_traits/is_equality_comparable.h>
#include <__type_traits/is_integral.h>
-#include <__type_traits/operation_traits.h>
#include <__utility/move.h>
#include <__utility/pair.h>
#include <__utility/unreachable.h>
@@ -59,7 +59,7 @@ template <class _Tp,
class _Pred,
class _Proj1,
class _Proj2,
- __enable_if_t<is_integral<_Tp>::value && __desugars_to<__equal_tag, _Pred, _Tp, _Tp>::value &&
+ __enable_if_t<is_integral<_Tp>::value && __desugars_to_v<__equal_tag, _Pred, _Tp, _Tp> &&
__is_identity<_Proj1>::value && __is_identity<_Proj2>::value,
int> = 0>
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Tp*, _Tp*>
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
index 14a0d76741d4..376abd39fa36 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
@@ -14,9 +14,9 @@
#include <__iterator/concepts.h>
#include <__iterator/iterator_traits.h>
#include <__numeric/transform_reduce.h>
+#include <__type_traits/desugars_to.h>
#include <__type_traits/is_arithmetic.h>
#include <__type_traits/is_execution_policy.h>
-#include <__type_traits/operation_traits.h>
#include <__utility/move.h>
#include <new>
#include <optional>
@@ -37,7 +37,7 @@ template <typename _DifferenceType,
typename _BinaryOperation,
typename _UnaryOperation,
typename _UnaryResult = invoke_result_t<_UnaryOperation, _DifferenceType>,
- __enable_if_t<__desugars_to<__plus_tag, _BinaryOperation, _Tp, _UnaryResult>::value && is_arithmetic_v<_Tp> &&
+ __enable_if_t<__desugars_to_v<__plus_tag, _BinaryOperation, _Tp, _UnaryResult> && is_arithmetic_v<_Tp> &&
is_arithmetic_v<_UnaryResult>,
int> = 0>
_LIBCPP_HIDE_FROM_ABI _Tp
@@ -53,8 +53,8 @@ template <typename _Size,
typename _BinaryOperation,
typename _UnaryOperation,
typename _UnaryResult = invoke_result_t<_UnaryOperation, _Size>,
- __enable_if_t<!(__desugars_to<__plus_tag, _BinaryOperation, _Tp, _UnaryResult>::value &&
- is_arithmetic_v<_Tp> && is_arithmetic_v<_UnaryResult>),
+ __enable_if_t<!(__desugars_to_v<__plus_tag, _BinaryOperation, _Tp, _UnaryResult> && is_arithmetic_v<_Tp> &&
+ is_arithmetic_v<_UnaryResult>),
int> = 0>
_LIBCPP_HIDE_FROM_ABI _Tp
__simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _UnaryOperation __f) noexcept {
diff --git a/libcxx/include/__chrono/tzdb.h b/libcxx/include/__chrono/tzdb.h
index 45c20f279f9c..e0bfedf0d782 100644
--- a/libcxx/include/__chrono/tzdb.h
+++ b/libcxx/include/__chrono/tzdb.h
@@ -16,6 +16,7 @@
// Enable the contents of the header only when libc++ was built with experimental features enabled.
#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB)
+# include <__algorithm/ranges_lower_bound.h>
# include <__chrono/leap_second.h>
# include <__chrono/time_zone.h>
# include <__chrono/time_zone_link.h>
@@ -43,6 +44,40 @@ struct tzdb {
vector<time_zone_link> links;
vector<leap_second> leap_seconds;
+
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI const time_zone* __locate_zone(string_view __name) const {
+ if (const time_zone* __result = __find_in_zone(__name))
+ return __result;
+
+ if (auto __it = ranges::lower_bound(links, __name, {}, &time_zone_link::name);
+ __it != links.end() && __it->name() == __name)
+ if (const time_zone* __result = __find_in_zone(__it->target()))
+ return __result;
+
+ return nullptr;
+ }
+
+ _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI const time_zone* locate_zone(string_view __name) const {
+ if (const time_zone* __result = __locate_zone(__name))
+ return __result;
+
+ std::__throw_runtime_error("tzdb: requested time zone not found");
+ }
+
+ _LIBCPP_NODISCARD_EXT _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI const time_zone* current_zone() const {
+ return __current_zone();
+ }
+
+private:
+ _LIBCPP_HIDE_FROM_ABI const time_zone* __find_in_zone(string_view __name) const noexcept {
+ if (auto __it = ranges::lower_bound(zones, __name, {}, &time_zone::name);
+ __it != zones.end() && __it->name() == __name)
+ return std::addressof(*__it);
+
+ return nullptr;
+ }
+
+ [[nodiscard]] _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI const time_zone* __current_zone() const;
};
} // namespace chrono
diff --git a/libcxx/include/__chrono/tzdb_list.h b/libcxx/include/__chrono/tzdb_list.h
index e8aaf31e3631..693899d37211 100644
--- a/libcxx/include/__chrono/tzdb_list.h
+++ b/libcxx/include/__chrono/tzdb_list.h
@@ -17,6 +17,7 @@
#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB)
# include <__availability>
+# include <__chrono/time_zone.h>
# include <__chrono/tzdb.h>
# include <__config>
# include <__fwd/string.h>
@@ -84,6 +85,15 @@ _LIBCPP_NODISCARD_EXT _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI inline con
return get_tzdb_list().front();
}
+_LIBCPP_NODISCARD_EXT _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI inline const time_zone*
+locate_zone(string_view __name) {
+ return get_tzdb().locate_zone(__name);
+}
+
+_LIBCPP_NODISCARD_EXT _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI inline const time_zone* current_zone() {
+ return get_tzdb().current_zone();
+}
+
_LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI const tzdb& reload_tzdb();
_LIBCPP_NODISCARD_EXT _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI string remote_version();
diff --git a/libcxx/include/__functional/operations.h b/libcxx/include/__functional/operations.h
index 7ddc00650f16..9aa28e492506 100644
--- a/libcxx/include/__functional/operations.h
+++ b/libcxx/include/__functional/operations.h
@@ -13,8 +13,7 @@
#include <__config>
#include <__functional/binary_function.h>
#include <__functional/unary_function.h>
-#include <__type_traits/integral_constant.h>
-#include <__type_traits/operation_traits.h>
+#include <__type_traits/desugars_to.h>
#include <__utility/forward.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -41,10 +40,10 @@ _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(plus);
// The non-transparent std::plus specialization is only equivalent to a raw plus
// operator when we don't perform an implicit conversion when calling it.
template <class _Tp>
-struct __desugars_to<__plus_tag, plus<_Tp>, _Tp, _Tp> : true_type {};
+inline const bool __desugars_to_v<__plus_tag, plus<_Tp>, _Tp, _Tp> = true;
template <class _Tp, class _Up>
-struct __desugars_to<__plus_tag, plus<void>, _Tp, _Up> : true_type {};
+inline const bool __desugars_to_v<__plus_tag, plus<void>, _Tp, _Up> = true;
#if _LIBCPP_STD_VER >= 14
template <>
@@ -315,11 +314,11 @@ struct _LIBCPP_TEMPLATE_VIS equal_to<void> {
// The non-transparent std::equal_to specialization is only equivalent to a raw equality
// comparison when we don't perform an implicit conversion when calling it.
template <class _Tp>
-struct __desugars_to<__equal_tag, equal_to<_Tp>, _Tp, _Tp> : true_type {};
+inline const bool __desugars_to_v<__equal_tag, equal_to<_Tp>, _Tp, _Tp> = true;
// In the transparent case, we do not enforce that
template <class _Tp, class _Up>
-struct __desugars_to<__equal_tag, equal_to<void>, _Tp, _Up> : true_type {};
+inline const bool __desugars_to_v<__equal_tag, equal_to<void>, _Tp, _Up> = true;
#if _LIBCPP_STD_VER >= 14
template <class _Tp = void>
diff --git a/libcxx/include/__functional/ranges_operations.h b/libcxx/include/__functional/ranges_operations.h
index 38b28018049e..a9dffaf69625 100644
--- a/libcxx/include/__functional/ranges_operations.h
+++ b/libcxx/include/__functional/ranges_operations.h
@@ -13,8 +13,7 @@
#include <__concepts/equality_comparable.h>
#include <__concepts/totally_ordered.h>
#include <__config>
-#include <__type_traits/integral_constant.h>
-#include <__type_traits/operation_traits.h>
+#include <__type_traits/desugars_to.h>
#include <__utility/forward.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -98,7 +97,7 @@ struct greater_equal {
// For ranges we do not require that the types on each side of the equality
// operator are of the same type
template <class _Tp, class _Up>
-struct __desugars_to<__equal_tag, ranges::equal_to, _Tp, _Up> : true_type {};
+inline const bool __desugars_to_v<__equal_tag, ranges::equal_to, _Tp, _Up> = true;
#endif // _LIBCPP_STD_VER >= 20
diff --git a/libcxx/include/__numeric/pstl_transform_reduce.h b/libcxx/include/__numeric/pstl_transform_reduce.h
index 2f412d41f7f2..07ecf0d9956b 100644
--- a/libcxx/include/__numeric/pstl_transform_reduce.h
+++ b/libcxx/include/__numeric/pstl_transform_reduce.h
@@ -87,7 +87,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp transform_reduce(
}
// This overload doesn't get a customization point because it's trivial to detect (through e.g.
-// __desugars_to) when specializing the more general variant, which should always be preferred
+// __desugars_to_v) when specializing the more general variant, which should always be preferred
template <class _ExecutionPolicy,
class _ForwardIterator1,
class _ForwardIterator2,
diff --git a/libcxx/include/__type_traits/operation_traits.h b/libcxx/include/__type_traits/desugars_to.h
index ef6e71693430..a8f69c28dfc5 100644
--- a/libcxx/include/__type_traits/operation_traits.h
+++ b/libcxx/include/__type_traits/desugars_to.h
@@ -6,11 +6,10 @@
//
//===----------------------------------------------------------------------===//
-#ifndef _LIBCPP___TYPE_TRAITS_OPERATION_TRAITS_H
-#define _LIBCPP___TYPE_TRAITS_OPERATION_TRAITS_H
+#ifndef _LIBCPP___TYPE_TRAITS_DESUGARS_TO_H
+#define _LIBCPP___TYPE_TRAITS_DESUGARS_TO_H
#include <__config>
-#include <__type_traits/integral_constant.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
@@ -33,8 +32,8 @@ struct __plus_tag {};
// predicate being passed is actually going to call a builtin operator, or has
// some specific semantics.
template <class _CanonicalTag, class _Operation, class... _Args>
-struct __desugars_to : false_type {};
+inline const bool __desugars_to_v = false;
_LIBCPP_END_NAMESPACE_STD
-#endif // _LIBCPP___TYPE_TRAITS_OPERATION_TRAITS_H
+#endif // _LIBCPP___TYPE_TRAITS_DESUGARS_TO_H
diff --git a/libcxx/include/chrono b/libcxx/include/chrono
index 4dd43137b718..8fdc30a3624d 100644
--- a/libcxx/include/chrono
+++ b/libcxx/include/chrono
@@ -689,6 +689,9 @@ struct tzdb {
vector<time_zone> zones;
vector<time_zone_link> links;
vector<leap_second> leap_seconds;
+
+ const time_zone* locate_zone(string_view tz_name) const;
+ const time_zone* current_zone() const;
};
class tzdb_list { // C++20
@@ -714,6 +717,8 @@ public:
// [time.zone.db.access], time zone database access
const tzdb& get_tzdb(); // C++20
tzdb_list& get_tzdb_list(); // C++20
+const time_zone* locate_zone(string_view tz_name); // C++20
+const time_zone* current_zone(); // C++20
// [time.zone.db.remote], remote time zone database support
const tzdb& reload_tzdb(); // C++20
diff --git a/libcxx/include/libcxx.imp b/libcxx/include/libcxx.imp
index 2cb1fa5e1e2a..607f63e6d822 100644
--- a/libcxx/include/libcxx.imp
+++ b/libcxx/include/libcxx.imp
@@ -734,6 +734,7 @@
{ include: [ "<__type_traits/datasizeof.h>", "private", "<type_traits>", "public" ] },
{ include: [ "<__type_traits/decay.h>", "private", "<type_traits>", "public" ] },
{ include: [ "<__type_traits/dependent_type.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/desugars_to.h>", "private", "<type_traits>", "public" ] },
{ include: [ "<__type_traits/disjunction.h>", "private", "<type_traits>", "public" ] },
{ include: [ "<__type_traits/enable_if.h>", "private", "<type_traits>", "public" ] },
{ include: [ "<__type_traits/extent.h>", "private", "<type_traits>", "public" ] },
@@ -818,7 +819,6 @@
{ include: [ "<__type_traits/nat.h>", "private", "<type_traits>", "public" ] },
{ include: [ "<__type_traits/negation.h>", "private", "<type_traits>", "public" ] },
{ include: [ "<__type_traits/noexcept_move_assign_container.h>", "private", "<type_traits>", "public" ] },
- { include: [ "<__type_traits/operation_traits.h>", "private", "<type_traits>", "public" ] },
{ include: [ "<__type_traits/promote.h>", "private", "<type_traits>", "public" ] },
{ include: [ "<__type_traits/rank.h>", "private", "<type_traits>", "public" ] },
{ include: [ "<__type_traits/remove_all_extents.h>", "private", "<type_traits>", "public" ] },
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 6d4dcc2511f3..ed45a1b18338 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1867,6 +1867,7 @@ module std_private_type_traits_decay [system
export std_private_type_traits_add_pointer
}
module std_private_type_traits_dependent_type [system] { header "__type_traits/dependent_type.h" }
+module std_private_type_traits_desugars_to [system] { header "__type_traits/desugars_to.h" }
module std_private_type_traits_disjunction [system] { header "__type_traits/disjunction.h" }
module std_private_type_traits_enable_if [system] { header "__type_traits/enable_if.h" }
module std_private_type_traits_extent [system] { header "__type_traits/extent.h" }
@@ -2017,7 +2018,6 @@ module std_private_type_traits_maybe_const [system
module std_private_type_traits_nat [system] { header "__type_traits/nat.h" }
module std_private_type_traits_negation [system] { header "__type_traits/negation.h" }
module std_private_type_traits_noexcept_move_assign_container [system] { header "__type_traits/noexcept_move_assign_container.h" }
-module std_private_type_traits_operation_traits [system] { header "__type_traits/operation_traits.h" }
module std_private_type_traits_promote [system] { header "__type_traits/promote.h" }
module std_private_type_traits_rank [system] { header "__type_traits/rank.h" }
module std_private_type_traits_remove_all_extents [system] { header "__type_traits/remove_all_extents.h" }
diff --git a/libcxx/include/strstream b/libcxx/include/strstream
index e9f533644f78..c8df6eb0da03 100644
--- a/libcxx/include/strstream
+++ b/libcxx/include/strstream
@@ -13,7 +13,7 @@
/*
strstream synopsis
-class strstreambuf
+class strstreambuf // Removed in C++26
: public basic_streambuf<char>
{
public:
@@ -63,7 +63,7 @@ private:
void (*pfree)(void*); // exposition only
};
-class istrstream
+class istrstream // Removed in C++26
: public basic_istream<char>
{
public:
@@ -81,7 +81,7 @@ private:
strstreambuf sb; // exposition only
};
-class ostrstream
+class ostrstream // Removed in C++26
: public basic_ostream<char>
{
public:
@@ -99,7 +99,7 @@ private:
strstreambuf sb; // exposition only
};
-class strstream
+class strstream // Removed in C++26
: public basic_iostream<char>
{
public:
@@ -138,6 +138,8 @@ private:
# pragma GCC system_header
#endif
+#if _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM) || defined(_LIBCPP_BUILDING_LIBRARY)
+
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
@@ -344,4 +346,6 @@ _LIBCPP_END_NAMESPACE_STD
_LIBCPP_POP_MACROS
+#endif // _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM) || defined(_LIBCPP_BUILDING_LIBRARY)
+
#endif // _LIBCPP_STRSTREAM
diff --git a/libcxx/modules/std/chrono.inc b/libcxx/modules/std/chrono.inc
index 2c0bd3f98a67..e14228043d3b 100644
--- a/libcxx/modules/std/chrono.inc
+++ b/libcxx/modules/std/chrono.inc
@@ -199,10 +199,10 @@ export namespace std {
using std::chrono::tzdb_list;
// [time.zone.db.access], time zone database access
- // using std::chrono::current_zone;
+ using std::chrono::current_zone;
using std::chrono::get_tzdb;
using std::chrono::get_tzdb_list;
- // using std::chrono::locate_zone;
+ using std::chrono::locate_zone;
// [time.zone.db.remote], remote time zone database support
using std::chrono::reload_tzdb;
diff --git a/libcxx/modules/std/strstream.inc b/libcxx/modules/std/strstream.inc
index a33c514f1502..808796701aba 100644
--- a/libcxx/modules/std/strstream.inc
+++ b/libcxx/modules/std/strstream.inc
@@ -9,9 +9,11 @@
export namespace std {
#ifndef _LIBCPP_HAS_NO_LOCALIZATION
+# if _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM)
using std::istrstream;
using std::ostrstream;
using std::strstream;
using std::strstreambuf;
+# endif // _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM)
#endif // _LIBCPP_HAS_NO_LOCALIZATION
} // namespace std
diff --git a/libcxx/src/tzdb.cpp b/libcxx/src/tzdb.cpp
index 7ba5ceb7ada3..2c82a4a4317a 100644
--- a/libcxx/src/tzdb.cpp
+++ b/libcxx/src/tzdb.cpp
@@ -675,6 +675,57 @@ void __init_tzdb(tzdb& __tzdb, __tz::__rules_storage_type& __rules) {
std::ranges::sort(__tzdb.leap_seconds);
}
+#ifdef _WIN32
+[[nodiscard]] static const time_zone* __current_zone_windows(const tzdb& tzdb) {
+ // TODO TZDB Implement this on Windows.
+ std::__throw_runtime_error("unknown time zone");
+}
+#else // ifdef _WIN32
+[[nodiscard]] static const time_zone* __current_zone_posix(const tzdb& tzdb) {
+ // On POSIX systems there are several ways to configure the time zone.
+ // In order of priority they are:
+ // - TZ environment variable
+ // https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap08.html#tag_08
+ // The documentation is unclear whether or not it's allowed to
+ // change time zone information. For example the TZ string
+ // MST7MDT
+ // this is an entry in tzdata.zi. The value
+ // MST
+ // is also an entry. Is it allowed to use the following?
+ // MST-3
+ // Even when this is valid there is no time_zone record in the
+ // database. Since the library would need to return a valid pointer,
+ // this means the library needs to allocate and leak a pointer.
+ //
+ // - The time zone name is the target of the symlink /etc/localtime
+ // relative to /usr/share/zoneinfo/
+
+ // The algorithm is like this:
+ // - If the environment variable TZ is set and points to a valid
+ // record use this value.
+ // - Else use the name based on the `/etc/localtime` symlink.
+
+ if (const char* __tz = getenv("TZ"))
+ if (const time_zone* __result = tzdb.__locate_zone(__tz))
+ return __result;
+
+ filesystem::path __path = "/etc/localtime";
+ if (!std::filesystem::exists(__path))
+ std::__throw_runtime_error("tzdb: the symlink '/etc/localtime' does not exist");
+
+ if (!std::filesystem::is_symlink(__path))
+ std::__throw_runtime_error("tzdb: the path '/etc/localtime' is not a symlink");
+
+ filesystem::path __tz = filesystem::read_symlink(__path);
+ string __name = filesystem::relative(__tz, "/usr/share/zoneinfo/");
+
+ if (const time_zone* __result = tzdb.__locate_zone(__name))
+ return __result;
+
+ std::__throw_runtime_error(("tzdb: the time zone '" + __name + "' is not found in the database").c_str());
+}
+#endif // ifdef _WIN32
+
//===----------------------------------------------------------------------===//
// Public API
//===----------------------------------------------------------------------===//
@@ -684,6 +735,14 @@ _LIBCPP_NODISCARD_EXT _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI tzdb_l
return __result;
}
+[[nodiscard]] _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI const time_zone* tzdb::__current_zone() const {
+#ifdef _WIN32
+ return chrono::__current_zone_windows(*this);
+#else
+ return chrono::__current_zone_posix(*this);
+#endif
+}
+
_LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI const tzdb& reload_tzdb() {
if (chrono::remote_version() == chrono::get_tzdb().version)
return chrono::get_tzdb();
diff --git a/libcxx/test/libcxx/atomics/diagnose_invalid_memory_order.verify.cpp b/libcxx/test/libcxx/atomics/diagnose_invalid_memory_order.verify.cpp
index defd43cf267a..2790916edaf6 100644
--- a/libcxx/test/libcxx/atomics/diagnose_invalid_memory_order.verify.cpp
+++ b/libcxx/test/libcxx/atomics/diagnose_invalid_memory_order.verify.cpp
@@ -9,7 +9,7 @@
// This test fails with Clang <18 because diagnose_if doesn't emit all of the
// diagnostics when -fdelayed-template-parsing is enabled, like it is in MSVC
// mode.
-// XFAIL: msvc && (clang-16 || clang-17)
+// XFAIL: msvc && clang-17
// REQUIRES: diagnose-if-support
diff --git a/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp b/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp
index c868832ea74a..9acb57fa05f7 100644
--- a/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp
+++ b/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp
@@ -38,9 +38,17 @@ void test() {
std::chrono::get_tzdb_list();
std::chrono::get_tzdb();
+ std::chrono::locate_zone("name");
+ std::chrono::current_zone();
std::chrono::remote_version();
{
+ const std::chrono::tzdb& t = list.front();
+ t.locate_zone("name");
+ t.current_zone();
+ }
+
+ {
tz.name();
operator==(tz, tz);
operator<=>(tz, tz);
diff --git a/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp b/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp
index 4d26b46a89c9..8795a4eb3c6c 100644
--- a/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp
+++ b/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp
@@ -33,9 +33,17 @@ void test() {
list.cbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
list.cend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+ {
+ const std::chrono::tzdb& t = list.front();
+ t.locate_zone("name"); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+ t.current_zone(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+ }
+
namespace crno = std::chrono;
crno::get_tzdb_list(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
crno::get_tzdb(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+ crno::locate_zone("n"); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+ crno::current_zone(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
crno::remote_version(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
{
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/no_unique_address.compile.pass.cpp
index b411ce198e25..a0bfb7c4a246 100644
--- a/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/no_unique_address.compile.pass.cpp
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/no_unique_address.compile.pass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
// UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: msvc && (clang-16 || clang-17)
+// XFAIL: msvc && clang-17
// class lazy_split_view {
// _LIBCPP_NO_UNIQUE_ADDRESS _View __base_ = _View();
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.split/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.split/no_unique_address.compile.pass.cpp
index 0d8bfbc0316d..694cf1fd0d0e 100644
--- a/libcxx/test/libcxx/ranges/range.adaptors/range.split/no_unique_address.compile.pass.cpp
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.split/no_unique_address.compile.pass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
// UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: msvc && (clang-16 || clang-17)
+// XFAIL: msvc && clang-17
// class split_view {
// _LIBCPP_NO_UNIQUE_ADDRESS _View __base_ = _View();
diff --git a/libcxx/test/libcxx/ranges/range.factories/range.istream.view/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.factories/range.istream.view/no_unique_address.compile.pass.cpp
index 8359d267245f..a77c4e4d1bcd 100644
--- a/libcxx/test/libcxx/ranges/range.factories/range.istream.view/no_unique_address.compile.pass.cpp
+++ b/libcxx/test/libcxx/ranges/range.factories/range.istream.view/no_unique_address.compile.pass.cpp
@@ -8,7 +8,7 @@
// UNSUPPORTED: no-localization
// UNSUPPORTED: c++03, c++11, c++14, c++17
-// XFAIL: msvc && (clang-16 || clang-17)
+// XFAIL: msvc && clang-17
// Test the libc++ extension that the value stored in `std::ranges::istream_view` has been marked
// as _LIBCPP_NO_UNIQUE_ADDRESS
diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp
new file mode 100644
index 000000000000..971f7f04c49a
--- /dev/null
+++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp
@@ -0,0 +1,84 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-incomplete-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// struct tzdb
+
+// const time_zone* locate_zone(string_view tz_name) const;
+
+#include <cassert>
+#include <chrono>
+#include <fstream>
+#include <string_view>
+
+#include "test_macros.h"
+#include "assert_macros.h"
+#include "concat_macros.h"
+#include "filesystem_test_helper.h"
+#include "test_tzdb.h"
+
+scoped_test_env env;
+[[maybe_unused]] const std::filesystem::path dir = env.create_dir("zoneinfo");
+const std::filesystem::path file = env.create_file("zoneinfo/tzdata.zi");
+
+std::string_view std::chrono::__libcpp_tzdb_directory() {
+ static std::string result = dir.string();
+ return result;
+}
+
+void write(std::string_view input) {
+ static int version = 0;
+
+ std::ofstream f{file};
+ f << "# version " << version++ << '\n';
+ f.write(input.data(), input.size());
+}
+
+static const std::chrono::tzdb& parse(std::string_view input) {
+ write(input);
+ return std::chrono::reload_tzdb();
+}
+
+int main(int, const char**) {
+ const std::chrono::tzdb& tzdb = parse(
+ R"(
+Z zone 0 r f
+L zone link
+L link link_to_link
+)");
+
+ {
+ const std::chrono::time_zone* tz = tzdb.locate_zone("zone");
+ assert(tz);
+ assert(tz->name() == "zone");
+ }
+ {
+ const std::chrono::time_zone* tz = tzdb.locate_zone("link");
+ assert(tz);
+ assert(tz->name() == "zone");
+ }
+
+ TEST_VALIDATE_EXCEPTION(
+ std::runtime_error,
+ [&]([[maybe_unused]] const std::runtime_error& e) {
+ std::string_view what{"tzdb: requested time zone not found"};
+ TEST_LIBCPP_REQUIRE(
+ e.what() == what,
+ TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception ", e.what(), '\n'));
+ },
+ TEST_IGNORE_NODISCARD tzdb.locate_zone("link_to_link"));
+
+ return 0;
+}
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp.pass.cpp
index b5ee0bfbecf0..701e6dfde57a 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp_size.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp_size.pass.cpp
index 4d0d6731338a..fd2ad66e108b 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp_size.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp_size.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp.pass.cpp
index 58980949732d..c5fe349f24d7 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp_size.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp_size.pass.cpp
index e13e20e20f8c..7d9c7d6c4200 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp_size.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp_size.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/rdbuf.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/rdbuf.pass.cpp
index 449114a0f6a9..f5ef29bcdbe5 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/rdbuf.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/rdbuf.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/str.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/str.pass.cpp
index e7c063750fb6..63d3800290e1 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/str.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/str.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.verify.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.verify.cpp
index 2ab252e93494..d02f12d9ebb1 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.verify.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.verify.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
+
// <strstream>
// check that istrstream is marked deprecated
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/types.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/types.pass.cpp
index be1a9e1251ba..526c4dc8b41d 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/types.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/types.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/cp_size_mode.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/cp_size_mode.pass.cpp
index 8698983a7ebc..b9db1bf444ed 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/cp_size_mode.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/cp_size_mode.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/default.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/default.pass.cpp
index abbf6af7e11e..b67f0ab16e92 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/default.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/default.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/freeze.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/freeze.pass.cpp
index 854e68b17249..e087c0686f09 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/freeze.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/freeze.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/pcount.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/pcount.pass.cpp
index 9830aeb6fc8c..73f20335d03d 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/pcount.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/pcount.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/rdbuf.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/rdbuf.pass.cpp
index f9a859dd8dab..d8b55d90315e 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/rdbuf.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/rdbuf.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/str.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/str.pass.cpp
index 72f665af5851..28670315339e 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/str.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/str.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.verify.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.verify.cpp
index e0c805fee107..9ec46509faad 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.verify.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.verify.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
+
// <strstream>
// check that ostrstream is marked deprecated
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/types.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/types.pass.cpp
index 6a71c44a90cf..e321444924fd 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/types.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/types.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/cp_size_mode.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/cp_size_mode.pass.cpp
index a85e1328b351..54e09d3f30db 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/cp_size_mode.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/cp_size_mode.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/default.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/default.pass.cpp
index 390162ef0f18..1e4c1209f711 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/default.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/default.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.dest/rdbuf.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.dest/rdbuf.pass.cpp
index 3fe277a48cbb..99d58cc57351 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.dest/rdbuf.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.dest/rdbuf.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/freeze.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/freeze.pass.cpp
index 263fddef6c34..6cc26bba7d02 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/freeze.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/freeze.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/pcount.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/pcount.pass.cpp
index b053cf128ab4..efe2c324eb31 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/pcount.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/pcount.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/str.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/str.pass.cpp
index 3d251d9a99cd..e1bc40c86eaf 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/str.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/str.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.verify.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.verify.cpp
index 03655226a54b..ab88d6d49a11 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.verify.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.verify.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
+
// <strstream>
// check that strstream is marked deprecated
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/types.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/types.pass.cpp
index fb543841e8f3..76094302a0cd 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/types.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/types.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ccp_size.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ccp_size.pass.cpp
index 8f81707973d8..3d7f2ffcfead 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ccp_size.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ccp_size.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cp_size_cp.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cp_size_cp.pass.cpp
index 25a9617dcdaf..0c4bf6214f7f 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cp_size_cp.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cp_size_cp.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cscp_size.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cscp_size.pass.cpp
index fc3386ff5575..e09288354472 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cscp_size.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cscp_size.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cucp_size.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cucp_size.pass.cpp
index a74c504fd278..7f5504dac1b1 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cucp_size.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cucp_size.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/custom_alloc.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/custom_alloc.pass.cpp
index 756427df4820..0aa7e1afcb17 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/custom_alloc.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/custom_alloc.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp
index 81924c995b15..803be42fe756 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/scp_size_scp.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/scp_size_scp.pass.cpp
index b8991a8fc433..35c1512e0a1d 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/scp_size_scp.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/scp_size_scp.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ucp_size_ucp.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ucp_size_ucp.pass.cpp
index 1d3463f3cac1..e71fa031ebcb 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ucp_size_ucp.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ucp_size_ucp.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/freeze.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/freeze.pass.cpp
index 93eec8dde397..1276187d7f3e 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/freeze.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/freeze.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/overflow.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/overflow.pass.cpp
index 5b973cff0ce0..fc79e787d38f 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/overflow.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/overflow.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/pcount.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/pcount.pass.cpp
index b64c9dcb4447..b62c33924188 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/pcount.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/pcount.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/str.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/str.pass.cpp
index d6c8b8e7e11b..68be8ddec035 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/str.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/str.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/overflow.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/overflow.pass.cpp
index 37109c7e942c..6a932dc00afa 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/overflow.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/overflow.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/pbackfail.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/pbackfail.pass.cpp
index 698953f7121a..484a726073aa 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/pbackfail.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/pbackfail.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekoff.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekoff.pass.cpp
index d98e6f73f7aa..96900d89e402 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekoff.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekoff.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekpos.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekpos.pass.cpp
index be88f5aecc5c..f3193d352049 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekpos.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekpos.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/setbuf.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/setbuf.pass.cpp
index ce7612bc66a9..44e6704dccb4 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/setbuf.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/setbuf.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/underflow.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/underflow.pass.cpp
index 4fc79b575c7a..be916be479e2 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/underflow.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/underflow.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.verify.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.verify.cpp
index a598acbba8c8..471b287e7f8a 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.verify.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.verify.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
+
// <strstream>
// check that strstreambuf is marked deprecated
diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/types.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/types.pass.cpp
index bc312cbbb225..aee126056c0e 100644
--- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/types.pass.cpp
+++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/types.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM
// <strstream>
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp
new file mode 100644
index 000000000000..d85c8ba52622
--- /dev/null
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp
@@ -0,0 +1,77 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-incomplete-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// const time_zone* current_zone();
+
+#include <cassert>
+#include <chrono>
+#include <string_view>
+#include <stdlib.h>
+
+#include "test_macros.h"
+#include "assert_macros.h"
+#include "concat_macros.h"
+
+#ifdef _WIN32
+static void set_tz(std::string zone) {
+ // Note Windows does not have setenv, only putenv
+ // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/putenv-s-wputenv-s?view=msvc-170
+ // Unlike POSIX it does not mention the string of putenv becomes part
+ // of the environment.
+
+ int status = _putenv_s("TZ", zone.c_str());
+ assert(status == 0);
+}
+
+#else
+static void set_tz(const std::string& zone) {
+ int status = setenv("TZ", zone.c_str(), 1);
+ assert(status == 0);
+}
+#endif
+
+static void test_zone(const std::string& zone) {
+ set_tz(zone);
+ const std::chrono::time_zone* tz = std::chrono::current_zone();
+ assert(tz);
+ assert(tz->name() == zone);
+}
+
+static void test_link(const std::string& link, std::string_view zone) {
+ set_tz(link);
+ const std::chrono::time_zone* tz = std::chrono::current_zone();
+ assert(tz);
+ assert(tz->name() == zone);
+}
+
+int main(int, const char**) {
+ const std::chrono::time_zone* tz = std::chrono::current_zone();
+ // Returns a valid time zone, the value depends on the OS settings.
+ assert(tz);
+ // setting the environment to an invalid value returns the value of
+ // the OS setting.
+ set_tz("This is not a time zone");
+ assert(tz == std::chrono::current_zone());
+
+ const std::chrono::tzdb& db = std::chrono::get_tzdb();
+ for (const auto& zone : db.zones)
+ test_zone(std::string{zone.name()});
+
+ for (const auto& link : db.links)
+ test_link(std::string{link.name()}, link.target());
+
+ return 0;
+}
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp
new file mode 100644
index 000000000000..c3142a86bf9d
--- /dev/null
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-incomplete-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// const time_zone* locate_zone(string_view tz_name);
+
+#include <cassert>
+#include <chrono>
+#include <string_view>
+
+#include "test_macros.h"
+#include "assert_macros.h"
+#include "concat_macros.h"
+
+static void test_zone(std::string_view zone) {
+ const std::chrono::time_zone* tz = std::chrono::locate_zone(zone);
+ assert(tz);
+ assert(tz->name() == zone);
+}
+
+static void test_link(std::string_view link, std::string_view zone) {
+ const std::chrono::time_zone* tz = std::chrono::locate_zone(link);
+ assert(tz);
+ assert(tz->name() == zone);
+}
+
+static void test_exception([[maybe_unused]] std::string_view zone) {
+ TEST_VALIDATE_EXCEPTION(
+ std::runtime_error,
+ [&]([[maybe_unused]] const std::runtime_error& e) {
+ std::string_view what{"tzdb: requested time zone not found"};
+ TEST_LIBCPP_REQUIRE(
+ e.what() == what,
+ TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception ", e.what(), '\n'));
+ },
+ TEST_IGNORE_NODISCARD std::chrono::locate_zone(zone));
+}
+
+int main(int, const char**) {
+ const std::chrono::tzdb& db = std::chrono::get_tzdb();
+ for (const auto& zone : db.zones)
+ test_zone(zone.name());
+
+ for (const auto& link : db.links)
+ test_link(link.name(), link.target());
+
+ test_exception("This is not a time zone");
+
+ return 0;
+}
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp
new file mode 100644
index 000000000000..7b4218cc8421
--- /dev/null
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp
@@ -0,0 +1,79 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-incomplete-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// struct tzdb
+
+// const time_zone* current_zone() const;
+
+#include <cassert>
+#include <chrono>
+#include <string_view>
+#include <stdlib.h>
+
+#include "test_macros.h"
+#include "assert_macros.h"
+#include "concat_macros.h"
+
+#ifdef _WIN32
+static void set_tz(std::string zone) {
+ // Note Windows does not have setenv, only putenv
+ // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/putenv-s-wputenv-s?view=msvc-170
+ // Unlike POSIX it does not mention the string of putenv becomes part
+ // of the environment.
+
+ int status = _putenv_s("TZ", zone.c_str());
+ assert(status == 0);
+}
+
+#else
+static void set_tz(const std::string& zone) {
+ int status = setenv("TZ", zone.c_str(), 1);
+ assert(status == 0);
+}
+#endif
+
+static void test_zone(const std::string& zone) {
+ set_tz(zone);
+ const std::chrono::time_zone* tz = std::chrono::get_tzdb().current_zone();
+ assert(tz);
+ assert(tz->name() == zone);
+}
+
+static void test_link(const std::string& link, std::string_view zone) {
+ set_tz(link);
+ const std::chrono::time_zone* tz = std::chrono::get_tzdb().current_zone();
+ assert(tz);
+ assert(tz->name() == zone);
+}
+
+int main(int, const char**) {
+ const std::chrono::time_zone* tz = std::chrono::get_tzdb().current_zone();
+ // Returns a valid time zone, the value depends on the OS settings.
+ assert(tz);
+ // setting the environment to an invalid value returns the value of
+ // the OS setting.
+ set_tz("This is not a time zone");
+ assert(tz == std::chrono::get_tzdb().current_zone());
+
+ const std::chrono::tzdb& db = std::chrono::get_tzdb();
+ for (const auto& zone : db.zones)
+ test_zone(std::string{zone.name()});
+
+ for (const auto& link : db.links)
+ test_link(std::string{link.name()}, link.target());
+
+ return 0;
+}
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp
new file mode 100644
index 000000000000..12987f6c89d8
--- /dev/null
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-incomplete-tzdb
+// XFAIL: availability-tzdb-missing
+
+// <chrono>
+
+// struct tzdb
+
+// const time_zone* locate_zone(string_view tz_name) const;
+
+#include <cassert>
+#include <chrono>
+#include <string_view>
+
+#include "test_macros.h"
+#include "assert_macros.h"
+#include "concat_macros.h"
+
+static void test_zone(std::string_view zone) {
+ const std::chrono::time_zone* tz = std::chrono::get_tzdb().locate_zone(zone);
+ assert(tz);
+ assert(tz->name() == zone);
+}
+
+static void test_link(std::string_view link, std::string_view zone) {
+ const std::chrono::time_zone* tz = std::chrono::get_tzdb().locate_zone(link);
+ assert(tz);
+ assert(tz->name() == zone);
+}
+
+static void test_exception([[maybe_unused]] std::string_view zone) {
+ TEST_VALIDATE_EXCEPTION(
+ std::runtime_error,
+ [&]([[maybe_unused]] const std::runtime_error& e) {
+ std::string_view what{"tzdb: requested time zone not found"};
+ TEST_LIBCPP_REQUIRE(
+ e.what() == what,
+ TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception ", e.what(), '\n'));
+ },
+ TEST_IGNORE_NODISCARD std::chrono::get_tzdb().locate_zone(zone));
+}
+
+int main(int, const char**) {
+ const std::chrono::tzdb& db = std::chrono::get_tzdb();
+ for (const auto& zone : db.zones)
+ test_zone(zone.name());
+
+ for (const auto& link : db.links)
+ test_link(link.name(), link.target());
+
+ test_exception("This is not a time zone");
+
+ return 0;
+}
diff --git a/libcxx/utils/ci/oss-fuzz.sh b/libcxx/utils/ci/oss-fuzz.sh
index e5723406a9ff..03b59b294041 100755
--- a/libcxx/utils/ci/oss-fuzz.sh
+++ b/libcxx/utils/ci/oss-fuzz.sh
@@ -23,7 +23,7 @@ for test in libcxx/test/libcxx/fuzzing/*.pass.cpp; do
exe="$(basename ${test})"
exe="${exe%.pass.cpp}"
${CXX} ${CXXFLAGS} \
- -std=c++14 \
+ -std=c++20 \
-DLIBCPP_OSS_FUZZ \
-D_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS \
-nostdinc++ -cxx-isystem ${INSTALL}/include/c++/v1 \
diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp
index 39f4575031be..004d71097387 100644
--- a/lld/COFF/Chunks.cpp
+++ b/lld/COFF/Chunks.cpp
@@ -437,19 +437,17 @@ void SectionChunk::applyRelocation(uint8_t *off,
// Compute the RVA of the relocation for relative relocations.
uint64_t p = rva + rel.VirtualAddress;
uint64_t imageBase = file->ctx.config.imageBase;
- switch (getMachine()) {
- case AMD64:
+ switch (getArch()) {
+ case Triple::x86_64:
applyRelX64(off, rel.Type, os, s, p, imageBase);
break;
- case I386:
+ case Triple::x86:
applyRelX86(off, rel.Type, os, s, p, imageBase);
break;
- case ARMNT:
+ case Triple::thumb:
applyRelARM(off, rel.Type, os, s, p, imageBase);
break;
- case ARM64:
- case ARM64EC:
- case ARM64X:
+ case Triple::aarch64:
applyRelARM64(off, rel.Type, os, s, p, imageBase);
break;
default:
@@ -516,27 +514,25 @@ void SectionChunk::addAssociative(SectionChunk *child) {
}
static uint8_t getBaserelType(const coff_relocation &rel,
- llvm::COFF::MachineTypes machine) {
- switch (machine) {
- case AMD64:
+ Triple::ArchType arch) {
+ switch (arch) {
+ case Triple::x86_64:
if (rel.Type == IMAGE_REL_AMD64_ADDR64)
return IMAGE_REL_BASED_DIR64;
if (rel.Type == IMAGE_REL_AMD64_ADDR32)
return IMAGE_REL_BASED_HIGHLOW;
return IMAGE_REL_BASED_ABSOLUTE;
- case I386:
+ case Triple::x86:
if (rel.Type == IMAGE_REL_I386_DIR32)
return IMAGE_REL_BASED_HIGHLOW;
return IMAGE_REL_BASED_ABSOLUTE;
- case ARMNT:
+ case Triple::thumb:
if (rel.Type == IMAGE_REL_ARM_ADDR32)
return IMAGE_REL_BASED_HIGHLOW;
if (rel.Type == IMAGE_REL_ARM_MOV32T)
return IMAGE_REL_BASED_ARM_MOV32T;
return IMAGE_REL_BASED_ABSOLUTE;
- case ARM64:
- case ARM64EC:
- case ARM64X:
+ case Triple::aarch64:
if (rel.Type == IMAGE_REL_ARM64_ADDR64)
return IMAGE_REL_BASED_DIR64;
return IMAGE_REL_BASED_ABSOLUTE;
@@ -551,7 +547,7 @@ static uint8_t getBaserelType(const coff_relocation &rel,
// Only called when base relocation is enabled.
void SectionChunk::getBaserels(std::vector<Baserel> *res) {
for (const coff_relocation &rel : getRelocs()) {
- uint8_t ty = getBaserelType(rel, getMachine());
+ uint8_t ty = getBaserelType(rel, getArch());
if (ty == IMAGE_REL_BASED_ABSOLUTE)
continue;
Symbol *target = file->getSymbol(rel.SymbolTableIndex);
diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h
index 7b6bdeae4234..bb919037ecc2 100644
--- a/lld/COFF/Chunks.h
+++ b/lld/COFF/Chunks.h
@@ -18,6 +18,7 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Object/COFF.h"
+#include "llvm/Object/WindowsMachineFlag.h"
#include <utility>
#include <vector>
@@ -116,6 +117,7 @@ public:
bool isHotPatchable() const;
MachineTypes getMachine() const;
+ llvm::Triple::ArchType getArch() const;
std::optional<chpe_range_type> getArm64ECRangeType() const;
protected:
@@ -437,6 +439,10 @@ inline MachineTypes Chunk::getMachine() const {
return static_cast<const NonSectionChunk *>(this)->getMachine();
}
+inline llvm::Triple::ArchType Chunk::getArch() const {
+ return llvm::getMachineArchType(getMachine());
+}
+
inline std::optional<chpe_range_type> Chunk::getArm64ECRangeType() const {
// Data sections don't need codemap entries.
if (!(getOutputCharacteristics() & llvm::COFF::IMAGE_SCN_MEM_EXECUTE))
diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index 2b1d4abb6ed0..b0365b5b9441 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -31,7 +31,6 @@
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Object/COFFImportFile.h"
#include "llvm/Object/COFFModuleDefinition.h"
-#include "llvm/Object/WindowsMachineFlag.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
@@ -157,18 +156,7 @@ StringRef LinkerDriver::mangle(StringRef sym) {
}
llvm::Triple::ArchType LinkerDriver::getArch() {
- switch (ctx.config.machine) {
- case I386:
- return llvm::Triple::ArchType::x86;
- case AMD64:
- return llvm::Triple::ArchType::x86_64;
- case ARMNT:
- return llvm::Triple::ArchType::arm;
- case ARM64:
- return llvm::Triple::ArchType::aarch64;
- default:
- return llvm::Triple::ArchType::UnknownArch;
- }
+ return getMachineArchType(ctx.config.machine);
}
bool LinkerDriver::findUnderscoreMangle(StringRef sym) {
diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp
index 44aa506d2c35..3accf24663c6 100644
--- a/lld/COFF/SymbolTable.cpp
+++ b/lld/COFF/SymbolTable.cpp
@@ -19,7 +19,6 @@
#include "llvm/DebugInfo/DIContext.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/LTO/LTO.h"
-#include "llvm/Object/WindowsMachineFlag.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <utility>
diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp
index 30ccd68f7b75..017c17c2b03d 100644
--- a/lld/ELF/Arch/AArch64.cpp
+++ b/lld/ELF/Arch/AArch64.cpp
@@ -113,6 +113,8 @@ RelExpr AArch64::getRelExpr(RelType type, const Symbol &s,
case R_AARCH64_MOVW_UABS_G2_NC:
case R_AARCH64_MOVW_UABS_G3:
return R_ABS;
+ case R_AARCH64_AUTH_ABS64:
+ return R_AARCH64_AUTH;
case R_AARCH64_TLSDESC_ADR_PAGE21:
return R_AARCH64_TLSDESC_PAGE;
case R_AARCH64_TLSDESC_LD64_LO12:
@@ -204,7 +206,7 @@ bool AArch64::usesOnlyLowPageBits(RelType type) const {
}
RelType AArch64::getDynRel(RelType type) const {
- if (type == R_AARCH64_ABS64)
+ if (type == R_AARCH64_ABS64 || type == R_AARCH64_AUTH_ABS64)
return type;
return R_AARCH64_NONE;
}
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 27274d69b7f1..83f293ab2ce5 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -187,6 +187,7 @@ struct Config {
llvm::StringRef cmseOutputLib;
StringRef zBtiReport = "none";
StringRef zCetReport = "none";
+ StringRef zPauthReport = "none";
bool ltoBBAddrMap;
llvm::StringRef ltoBasicBlockSections;
std::pair<llvm::StringRef, llvm::StringRef> thinLTOObjectSuffixReplace;
@@ -499,6 +500,8 @@ struct Ctx {
void reset();
llvm::raw_fd_ostream openAuxiliaryFile(llvm::StringRef, std::error_code &);
+
+ ArrayRef<uint8_t> aarch64PauthAbiCoreInfo;
};
LLVM_LIBRARY_VISIBILITY extern Ctx ctx;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index b43da7727e22..86cc09621a91 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -46,6 +46,7 @@
#include "lld/Common/Strings.h"
#include "lld/Common/TargetOptionsCommandFlags.h"
#include "lld/Common/Version.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
@@ -461,6 +462,8 @@ static void checkOptions() {
error("-z force-bti only supported on AArch64");
if (config->zBtiReport != "none")
error("-z bti-report only supported on AArch64");
+ if (config->zPauthReport != "none")
+ error("-z pauth-report only supported on AArch64");
}
if (config->emachine != EM_386 && config->emachine != EM_X86_64 &&
@@ -1501,7 +1504,8 @@ static void readConfigs(opt::InputArgList &args) {
}
auto reports = {std::make_pair("bti-report", &config->zBtiReport),
- std::make_pair("cet-report", &config->zCetReport)};
+ std::make_pair("cet-report", &config->zCetReport),
+ std::make_pair("pauth-report", &config->zPauthReport)};
for (opt::Arg *arg : args.filtered(OPT_z)) {
std::pair<StringRef, StringRef> option =
StringRef(arg->getValue()).split('=');
@@ -2599,14 +2603,17 @@ static void redirectSymbols(ArrayRef<WrappedSymbol> wrapped) {
symtab.wrap(w.sym, w.real, w.wrap);
}
+static void reportMissingFeature(StringRef config, const Twine &report) {
+ if (config == "error")
+ error(report);
+ else if (config == "warning")
+ warn(report);
+}
+
static void checkAndReportMissingFeature(StringRef config, uint32_t features,
uint32_t mask, const Twine &report) {
- if (!(features & mask)) {
- if (config == "error")
- error(report);
- else if (config == "warning")
- warn(report);
- }
+ if (!(features & mask))
+ reportMissingFeature(config, report);
}
// To enable CET (x86's hardware-assisted control flow enforcement), each
@@ -2617,12 +2624,28 @@ static void checkAndReportMissingFeature(StringRef config, uint32_t features,
//
// This is also the case with AARCH64's BTI and PAC which use the similar
// GNU_PROPERTY_AARCH64_FEATURE_1_AND mechanism.
-static uint32_t getAndFeatures() {
+//
+// For AArch64 PAuth-enabled object files, the core info of all of them must
+// match. Missing info for some object files with matching info for remaining
+// ones can be allowed (see -z pauth-report).
+static void readSecurityNotes() {
if (config->emachine != EM_386 && config->emachine != EM_X86_64 &&
config->emachine != EM_AARCH64)
- return 0;
+ return;
+
+ config->andFeatures = -1;
+
+ StringRef referenceFileName;
+ if (config->emachine == EM_AARCH64) {
+ auto it = llvm::find_if(ctx.objectFiles, [](const ELFFileBase *f) {
+ return !f->aarch64PauthAbiCoreInfo.empty();
+ });
+ if (it != ctx.objectFiles.end()) {
+ ctx.aarch64PauthAbiCoreInfo = (*it)->aarch64PauthAbiCoreInfo;
+ referenceFileName = (*it)->getName();
+ }
+ }
- uint32_t ret = -1;
for (ELFFileBase *f : ctx.objectFiles) {
uint32_t features = f->andFeatures;
@@ -2658,14 +2681,31 @@ static uint32_t getAndFeatures() {
"GNU_PROPERTY_AARCH64_FEATURE_1_PAC property");
features |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
}
- ret &= features;
+ config->andFeatures &= features;
+
+ if (ctx.aarch64PauthAbiCoreInfo.empty())
+ continue;
+
+ if (f->aarch64PauthAbiCoreInfo.empty()) {
+ reportMissingFeature(config->zPauthReport,
+ toString(f) +
+ ": -z pauth-report: file does not have AArch64 "
+ "PAuth core info while '" +
+ referenceFileName + "' has one");
+ continue;
+ }
+
+ if (ctx.aarch64PauthAbiCoreInfo != f->aarch64PauthAbiCoreInfo)
+ errorOrWarn("incompatible values of AArch64 PAuth core info found\n>>> " +
+ referenceFileName + ": 0x" +
+ toHex(ctx.aarch64PauthAbiCoreInfo, /*LowerCase=*/true) +
+ "\n>>> " + toString(f) + ": 0x" +
+ toHex(f->aarch64PauthAbiCoreInfo, /*LowerCase=*/true));
}
// Force enable Shadow Stack.
if (config->zShstk)
- ret |= GNU_PROPERTY_X86_FEATURE_1_SHSTK;
-
- return ret;
+ config->andFeatures |= GNU_PROPERTY_X86_FEATURE_1_SHSTK;
}
static void initSectionsAndLocalSyms(ELFFileBase *file, bool ignoreComdats) {
@@ -2727,13 +2767,6 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) {
// Create dynamic sections for dynamic linking and static PIE.
config->hasDynSymTab = !ctx.sharedFiles.empty() || config->isPic;
- script->addScriptReferencedSymbolsToSymTable();
-
- // Prevent LTO from removing any definition referenced by -u.
- for (StringRef name : config->undefined)
- if (Defined *sym = dyn_cast_or_null<Defined>(symtab.find(name)))
- sym->isUsedInRegularObj = true;
-
// If an entry symbol is in a static archive, pull out that file now.
if (Symbol *sym = symtab.find(config->entry))
handleUndefined(sym, "--entry");
@@ -2742,6 +2775,16 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) {
for (StringRef pat : args::getStrings(args, OPT_undefined_glob))
handleUndefinedGlob(pat);
+ // After potential archive member extraction involving ENTRY and
+ // -u/--undefined-glob, check whether PROVIDE symbols should be defined (the
+ // RHS may refer to definitions in just extracted object files).
+ script->addScriptReferencedSymbolsToSymTable();
+
+ // Prevent LTO from removing any definition referenced by -u.
+ for (StringRef name : config->undefined)
+ if (Defined *sym = dyn_cast_or_null<Defined>(symtab.find(name)))
+ sym->isUsedInRegularObj = true;
+
// Mark -init and -fini symbols so that the LTO doesn't eliminate them.
if (Symbol *sym = dyn_cast_or_null<Defined>(symtab.find(config->init)))
sym->isUsedInRegularObj = true;
@@ -2944,7 +2987,7 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) {
// Read .note.gnu.property sections from input object files which
// contain a hint to tweak linker's and loader's behaviors.
- config->andFeatures = getAndFeatures();
+ readSecurityNotes();
// The Target instance handles target-specific stuff, such as applying
// relocations or writing a PLT section. It also contains target-dependent
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index 6529ea072fae..1f496026d3ae 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -926,25 +926,18 @@ void ObjFile<ELFT>::initializeSections(bool ignoreComdats,
handleSectionGroup<ELFT>(this->sections, entries);
}
-// If a source file is compiled with x86 hardware-assisted call flow control
-// enabled, the generated object file contains feature flags indicating that
-// fact. This function reads the feature flags and returns it.
-//
-// Essentially we want to read a single 32-bit value in this function, but this
-// function is rather complicated because the value is buried deep inside a
-// .note.gnu.property section.
-//
-// The section consists of one or more NOTE records. Each NOTE record consists
-// of zero or more type-length-value fields. We want to find a field of a
-// certain type. It seems a bit too much to just store a 32-bit value, perhaps
-// the ABI is unnecessarily complicated.
-template <class ELFT> static uint32_t readAndFeatures(const InputSection &sec) {
+// Read the following info from the .note.gnu.property section and write it to
+// the corresponding fields in `ObjFile`:
+// - Feature flags (32 bits) representing x86 or AArch64 features for
+// hardware-assisted call flow control;
+// - AArch64 PAuth ABI core info (16 bytes).
+template <class ELFT>
+void readGnuProperty(const InputSection &sec, ObjFile<ELFT> &f) {
using Elf_Nhdr = typename ELFT::Nhdr;
using Elf_Note = typename ELFT::Note;
- uint32_t featuresSet = 0;
ArrayRef<uint8_t> data = sec.content();
- auto reportFatal = [&](const uint8_t *place, const char *msg) {
+ auto reportFatal = [&](const uint8_t *place, const Twine &msg) {
fatal(toString(sec.file) + ":(" + sec.name + "+0x" +
Twine::utohexstr(place - sec.content().data()) + "): " + msg);
};
@@ -983,7 +976,19 @@ template <class ELFT> static uint32_t readAndFeatures(const InputSection &sec) {
// accumulate the bits set.
if (size < 4)
reportFatal(place, "FEATURE_1_AND entry is too short");
- featuresSet |= read32<ELFT::Endianness>(desc.data());
+ f.andFeatures |= read32<ELFT::Endianness>(desc.data());
+ } else if (config->emachine == EM_AARCH64 &&
+ type == GNU_PROPERTY_AARCH64_FEATURE_PAUTH) {
+ if (!f.aarch64PauthAbiCoreInfo.empty()) {
+ reportFatal(data.data(),
+ "multiple GNU_PROPERTY_AARCH64_FEATURE_PAUTH entries are "
+ "not supported");
+ } else if (size != 16) {
+ reportFatal(data.data(), "GNU_PROPERTY_AARCH64_FEATURE_PAUTH entry "
+ "is invalid: expected 16 bytes, but got " +
+ Twine(size));
+ }
+ f.aarch64PauthAbiCoreInfo = desc;
}
// Padding is present in the note descriptor, if necessary.
@@ -993,8 +998,6 @@ template <class ELFT> static uint32_t readAndFeatures(const InputSection &sec) {
// Go to next NOTE record to look for more FEATURE_1_AND descriptions.
data = data.slice(nhdr->getSize(sec.addralign));
}
-
- return featuresSet;
}
template <class ELFT>
@@ -1051,7 +1054,7 @@ InputSectionBase *ObjFile<ELFT>::createInputSection(uint32_t idx,
// .note.gnu.property containing a single AND'ed bitmap, we discard an input
// file's .note.gnu.property section.
if (name == ".note.gnu.property") {
- this->andFeatures = readAndFeatures<ELFT>(InputSection(*this, sec, name));
+ readGnuProperty<ELFT>(InputSection(*this, sec, name), *this);
return &InputSection::discarded;
}
diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h
index 95197599a2e1..834b3b63dd83 100644
--- a/lld/ELF/InputFiles.h
+++ b/lld/ELF/InputFiles.h
@@ -230,6 +230,7 @@ protected:
public:
uint32_t andFeatures = 0;
bool hasCommonSyms = false;
+ ArrayRef<uint8_t> aarch64PauthAbiCoreInfo;
};
// .o file.
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index 4f88313b868b..c06816bcfd56 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -676,6 +676,7 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
case R_DTPREL:
case R_RELAX_TLS_LD_TO_LE_ABS:
case R_RELAX_GOT_PC_NOPIC:
+ case R_AARCH64_AUTH:
case R_RISCV_ADD:
case R_RISCV_LEB128:
return sym.getVA(a);
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 92f2e200db11..55274344f881 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -995,7 +995,8 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr e, RelType type,
if (e == R_GOT || e == R_PLT)
return target->usesOnlyLowPageBits(type) || !config->isPic;
- if (sym.isPreemptible)
+ // R_AARCH64_AUTH_ABS64 requires a dynamic relocation.
+ if (sym.isPreemptible || e == R_AARCH64_AUTH)
return false;
if (!config->isPic)
return true;
@@ -1141,12 +1142,26 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset,
(rel == target->symbolicRel && !sym.isPreemptible)) {
addRelativeReloc<true>(*sec, offset, sym, addend, expr, type);
return;
- } else if (rel != 0) {
+ }
+ if (rel != 0) {
if (config->emachine == EM_MIPS && rel == target->symbolicRel)
rel = target->relativeRel;
std::lock_guard<std::mutex> lock(relocMutex);
- sec->getPartition().relaDyn->addSymbolReloc(rel, *sec, offset, sym,
- addend, type);
+ Partition &part = sec->getPartition();
+ if (config->emachine == EM_AARCH64 && type == R_AARCH64_AUTH_ABS64) {
+ // For a preemptible symbol, we can't use a relative relocation. For an
+ // undefined symbol, we can't compute offset at link-time and use a
+ // relative relocation. Use a symbolic relocation instead.
+ if (sym.isPreemptible) {
+ part.relaDyn->addSymbolReloc(type, *sec, offset, sym, addend, type);
+ } else {
+ part.relaDyn->addReloc({R_AARCH64_AUTH_RELATIVE, sec, offset,
+ DynamicReloc::AddendOnlyWithTargetVA, sym,
+ addend, R_ABS});
+ }
+ return;
+ }
+ part.relaDyn->addSymbolReloc(rel, *sec, offset, sym, addend, type);
// MIPS ABI turns using of GOT and dynamic relocations inside out.
// While regular ABI uses dynamic relocations to fill up GOT entries
@@ -1171,7 +1186,10 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset,
// When producing an executable, we can perform copy relocations (for
// STT_OBJECT) and canonical PLT (for STT_FUNC) if sym is defined by a DSO.
- if (!config->shared && sym.isShared()) {
+ // Copy relocations/canonical PLT entries are unsupported for
+ // R_AARCH64_AUTH_ABS64.
+ if (!config->shared && sym.isShared() &&
+ !(config->emachine == EM_AARCH64 && type == R_AARCH64_AUTH_ABS64)) {
if (!canDefineSymbolInExecutable(sym)) {
errorOrWarn("cannot preempt symbol: " + toString(sym) +
getLocation(*sec, sym, offset));
diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h
index 7eb8a811e693..b7b9c09e1b89 100644
--- a/lld/ELF/Relocations.h
+++ b/lld/ELF/Relocations.h
@@ -87,6 +87,7 @@ enum RelExpr {
R_AARCH64_PAGE_PC,
R_AARCH64_RELAX_TLS_GD_TO_IE_PAGE_PC,
R_AARCH64_TLSDESC_PAGE,
+ R_AARCH64_AUTH,
R_ARM_PCA,
R_ARM_SBREL,
R_MIPS_GOTREL,
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index d4dc713b4e27..4427a1200868 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -314,22 +314,42 @@ GnuPropertySection::GnuPropertySection()
config->wordsize, ".note.gnu.property") {}
void GnuPropertySection::writeTo(uint8_t *buf) {
+ write32(buf, 4); // Name size
+ write32(buf + 4, getSize() - 16); // Content size
+ write32(buf + 8, NT_GNU_PROPERTY_TYPE_0); // Type
+ memcpy(buf + 12, "GNU", 4); // Name string
+
uint32_t featureAndType = config->emachine == EM_AARCH64
? GNU_PROPERTY_AARCH64_FEATURE_1_AND
: GNU_PROPERTY_X86_FEATURE_1_AND;
- write32(buf, 4); // Name size
- write32(buf + 4, config->is64 ? 16 : 12); // Content size
- write32(buf + 8, NT_GNU_PROPERTY_TYPE_0); // Type
- memcpy(buf + 12, "GNU", 4); // Name string
- write32(buf + 16, featureAndType); // Feature type
- write32(buf + 20, 4); // Feature size
- write32(buf + 24, config->andFeatures); // Feature flags
- if (config->is64)
- write32(buf + 28, 0); // Padding
+ unsigned offset = 16;
+ if (config->andFeatures != 0) {
+ write32(buf + offset + 0, featureAndType); // Feature type
+ write32(buf + offset + 4, 4); // Feature size
+ write32(buf + offset + 8, config->andFeatures); // Feature flags
+ if (config->is64)
+ write32(buf + offset + 12, 0); // Padding
+ offset += 16;
+ }
+
+ if (!ctx.aarch64PauthAbiCoreInfo.empty()) {
+ write32(buf + offset + 0, GNU_PROPERTY_AARCH64_FEATURE_PAUTH);
+ write32(buf + offset + 4, ctx.aarch64PauthAbiCoreInfo.size());
+ memcpy(buf + offset + 8, ctx.aarch64PauthAbiCoreInfo.data(),
+ ctx.aarch64PauthAbiCoreInfo.size());
+ }
}
-size_t GnuPropertySection::getSize() const { return config->is64 ? 32 : 28; }
+size_t GnuPropertySection::getSize() const {
+ uint32_t contentSize = 0;
+ if (config->andFeatures != 0)
+ contentSize += config->is64 ? 16 : 12;
+ if (!ctx.aarch64PauthAbiCoreInfo.empty())
+ contentSize += 4 + 4 + ctx.aarch64PauthAbiCoreInfo.size();
+ assert(contentSize != 0);
+ return contentSize + 16;
+}
BuildIdSection::BuildIdSection()
: SyntheticSection(SHF_ALLOC, SHT_NOTE, 4, ".note.gnu.build-id"),
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 40d617b7fdf3..fc9084f40044 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -24,6 +24,7 @@
#include "lld/Common/CommonLinkerContext.h"
#include "lld/Common/Filesystem.h"
#include "lld/Common/Strings.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/Support/BLAKE3.h"
#include "llvm/Support/Parallel.h"
@@ -564,7 +565,7 @@ template <class ELFT> void elf::createSyntheticSections() {
in.iplt = std::make_unique<IpltSection>();
add(*in.iplt);
- if (config->andFeatures)
+ if (config->andFeatures || !ctx.aarch64PauthAbiCoreInfo.empty())
add(*make<GnuPropertySection>());
// .note.GNU-stack is always added when we are creating a re-linkable
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index 97ed06048910..bf0c8e55d103 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -29,6 +29,9 @@ ELF Improvements
* ``--compress-sections <section-glib>=[none|zlib|zstd]`` is added to compress
matched output sections without the ``SHF_ALLOC`` flag.
(`#84855 <https://github.com/llvm/llvm-project/pull/84855>`_)
+* ``GNU_PROPERTY_AARCH64_FEATURE_PAUTH`` notes, ``R_AARCH64_AUTH_ABS64`` and
+ ``R_AARCH64_AUTH_RELATIVE`` relocations are now supported.
+ (`#72714 <https://github.com/llvm/llvm-project/pull/72714>`_)
Breaking changes
----------------
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index 65e50e349c8c..e0316730f442 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -762,6 +762,11 @@ Specify how to report the missing GNU_PROPERTY_X86_FEATURE_1_IBT or GNU_PROPERTY
.Cm none
is the default, linker will not report the missing property otherwise will be reported as a warning or an error.
.Pp
+.It Cm pauth-report Ns = Ns Ar [none|warning|error]
+Specify how to report the missing GNU_PROPERTY_AARCH64_FEATURE_PAUTH property.
+.Cm none
+is the default, linker will not report the missing property otherwise will be reported as a warning or an error.
+.Pp
.It Cm force-bti
Force enable AArch64 BTI instruction in PLT, warn if Input ELF file does not have GNU_PROPERTY_AARCH64_FEATURE_1_BTI property.
.Pp
diff --git a/lld/test/COFF/print-search-paths-arm64.s b/lld/test/COFF/print-search-paths-arm64.s
new file mode 100644
index 000000000000..fb5c8897a165
--- /dev/null
+++ b/lld/test/COFF/print-search-paths-arm64.s
@@ -0,0 +1,24 @@
+# REQUIRES: aarch64
+
+# RUN: llvm-mc -triple aarch64-windows-msvc %s -filetype=obj -o %t.aarch64.obj
+# RUN: lld-link -dll -noentry -winsysroot:%t.dir/sysroot -vctoolsversion:1.1.1.1 -winsdkversion:10.0.1 -libpath:custom-dir \
+# RUN: %t.aarch64.obj -print-search-paths | FileCheck -DSYSROOT=%t.dir %s
+
+# RUN: llvm-mc -triple arm64ec-windows-msvc %s -filetype=obj -o %t.arm64ec.obj
+# RUN: lld-link -dll -noentry -winsysroot:%t.dir/sysroot -vctoolsversion:1.1.1.1 -winsdkversion:10.0.1 -libpath:custom-dir \
+# RUN: %t.arm64ec.obj -print-search-paths -machine:arm64ec | FileCheck -DSYSROOT=%t.dir %s
+
+# CHECK: Library search paths:
+# CHECK-NEXT: (cwd)
+# CHECK-NEXT: custom-dir
+# CHECK-NEXT: [[CPATH:.*]]lib{{[/\\]}}clang{{[/\\]}}{{[0-9]+}}{{[/\\]}}lib{{[/\\]}}windows
+# CHECK-NEXT: [[CPATH]]lib{{[/\\]}}clang{{[/\\]}}{{[0-9]+}}{{[/\\]}}lib
+# CHECK-NEXT: [[CPATH]]lib
+# CHECK-NEXT: [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}DIA SDK{{[/\\]}}lib{{[/\\]}}arm64
+# CHECK-NEXT: [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}VC{{[/\\]}}Tools{{[/\\]}}MSVC{{[/\\]}}1.1.1.1{{[/\\]}}lib{{[/\\]}}arm64
+# CHECK-NEXT: [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}VC{{[/\\]}}Tools{{[/\\]}}MSVC{{[/\\]}}1.1.1.1{{[/\\]}}atlmfc{{[/\\]}}lib{{[/\\]}}arm64
+# CHECK-NEXT: [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}Windows Kits{{[/\\]}}10{{[/\\]}}Lib{{[/\\]}}10.0.1{{[/\\]}}ucrt{{[/\\]}}arm64
+# CHECK-NEXT: [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}Windows Kits{{[/\\]}}10{{[/\\]}}Lib{{[/\\]}}10.0.1{{[/\\]}}um{{[/\\]}}arm64
+
+ .data
+ .word 1
diff --git a/lld/test/ELF/aarch64-bti-pac-cli-error.s b/lld/test/ELF/aarch64-bti-pac-cli-error.s
index b8ab1a28fa5a..703c0aac6ea5 100644
--- a/lld/test/ELF/aarch64-bti-pac-cli-error.s
+++ b/lld/test/ELF/aarch64-bti-pac-cli-error.s
@@ -1,17 +1,22 @@
# REQUIRES: x86
# RUN: llvm-mc --triple=x86_64-pc-linux --filetype=obj -o %t.o %s
-# RUN: not ld.lld -z pac-plt -z force-bti -z bti-report=error %t.o -o /dev/null 2>&1 | FileCheck %s
+# RUN: not ld.lld -z pac-plt -z force-bti -z bti-report=error \
+# RUN: -z pauth-report=error %t.o -o /dev/null 2>&1 | FileCheck %s
+# RUN: not ld.lld -z pac-plt -z force-bti -z bti-report=warning \
+# RUN: -z pauth-report=warning %t.o -o /dev/null 2>&1 | FileCheck %s
#
-## Check that we error if -z pac-plt, -z force-bti and -z bti-report=error are used when target is not
-## aarch64
+## Check that we error if -z pac-plt or -z force-bti is present, or if
+## -z bti-report or -z pauth-report is not none, when the target is not aarch64
# CHECK: error: -z pac-plt only supported on AArch64
# CHECK-NEXT: error: -z force-bti only supported on AArch64
# CHECK-NEXT: error: -z bti-report only supported on AArch64
+# CHECK-NEXT: error: -z pauth-report only supported on AArch64
-# RUN: not ld.lld -z bti-report=something %t.o -o /dev/null 2>&1 | \
-# RUN: FileCheck --check-prefix=REPORT_INVALID %s
+# RUN: not ld.lld -z bti-report=something -z pauth-report=something \
+# RUN: %t.o -o /dev/null 2>&1 | FileCheck --check-prefix=REPORT_INVALID %s
# REPORT_INVALID: error: -z bti-report= parameter something is not recognized
+# REPORT_INVALID: error: -z pauth-report= parameter something is not recognized
# REPORT_INVALID-EMPTY:
.globl start
diff --git a/lld/test/ELF/aarch64-feature-pauth.s b/lld/test/ELF/aarch64-feature-pauth.s
new file mode 100644
index 000000000000..699a650d7229
--- /dev/null
+++ b/lld/test/ELF/aarch64-feature-pauth.s
@@ -0,0 +1,114 @@
+# REQUIRES: aarch64
+
+# RUN: rm -rf %t && split-file %s %t && cd %t
+
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag1.s -o tag1.o
+# RUN: cp tag1.o tag1a.o
+# RUN: ld.lld -shared tag1.o tag1a.o -o tagok.so
+# RUN: llvm-readelf -n tagok.so | FileCheck --check-prefix OK %s
+
+# OK: AArch64 PAuth ABI core info: platform 0x2a (unknown), version 0x1
+
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag2.s -o tag2.o
+# RUN: not ld.lld tag1.o tag1a.o tag2.o -o /dev/null 2>&1 | FileCheck --check-prefix ERR1 %s
+
+# ERR1: error: incompatible values of AArch64 PAuth core info found
+# ERR1-NEXT: >>> tag1.o: 0x2a000000000000000{{1|2}}00000000000000
+# ERR1-NEXT: >>> tag2.o: 0x2a000000000000000{{1|2}}00000000000000
+
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-short.s -o short.o
+# RUN: not ld.lld short.o -o /dev/null 2>&1 | FileCheck --check-prefix ERR2 %s
+
+# ERR2: error: short.o:(.note.gnu.property+0x0): GNU_PROPERTY_AARCH64_FEATURE_PAUTH entry is invalid: expected 16 bytes, but got 12
+
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-long.s -o long.o
+# RUN: not ld.lld long.o -o /dev/null 2>&1 | FileCheck --check-prefix ERR3 %s
+
+# ERR3: error: long.o:(.note.gnu.property+0x0): GNU_PROPERTY_AARCH64_FEATURE_PAUTH entry is invalid: expected 16 bytes, but got 24
+
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-multiple.s -o multiple.o
+# RUN: not ld.lld multiple.o -o /dev/null 2>&1 | FileCheck --check-prefix ERR4 %s
+# ERR4: error: multiple.o:(.note.gnu.property+0x0): multiple GNU_PROPERTY_AARCH64_FEATURE_PAUTH entries are not supported
+
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu no-info.s -o noinfo1.o
+# RUN: cp noinfo1.o noinfo2.o
+# RUN: not ld.lld -z pauth-report=error noinfo1.o tag1.o noinfo2.o -o /dev/null 2>&1 | FileCheck --check-prefix ERR5 %s
+# RUN: ld.lld -z pauth-report=warning noinfo1.o tag1.o noinfo2.o -o /dev/null 2>&1 | FileCheck --check-prefix WARN %s
+# RUN: ld.lld -z pauth-report=none noinfo1.o tag1.o noinfo2.o --fatal-warnings -o /dev/null
+
+# ERR5: error: noinfo1.o: -z pauth-report: file does not have AArch64 PAuth core info while 'tag1.o' has one
+# ERR5-NEXT: error: noinfo2.o: -z pauth-report: file does not have AArch64 PAuth core info while 'tag1.o' has one
+# WARN: warning: noinfo1.o: -z pauth-report: file does not have AArch64 PAuth core info while 'tag1.o' has one
+# WARN-NEXT: warning: noinfo2.o: -z pauth-report: file does not have AArch64 PAuth core info while 'tag1.o' has one
+
+#--- abi-tag-short.s
+
+.section ".note.gnu.property", "a"
+.long 4
+.long 20
+.long 5
+.asciz "GNU"
+.long 0xc0000001
+.long 12
+.quad 2
+.long 31
+
+#--- abi-tag-long.s
+
+.section ".note.gnu.property", "a"
+.long 4
+.long 32
+.long 5
+.asciz "GNU"
+.long 0xc0000001
+.long 24
+.quad 2
+.quad 31
+.quad 0
+
+#--- abi-tag-multiple.s
+
+.section ".note.gnu.property", "a"
+.long 4
+.long 48
+.long 5
+.asciz "GNU"
+.long 0xc0000001
+.long 16
+.quad 42 // platform
+.quad 1 // version
+.long 0xc0000001
+.long 16
+.quad 42 // platform
+.quad 1 // version
+
+#--- abi-tag1.s
+
+.section ".note.gnu.property", "a"
+.long 4
+.long 24
+.long 5
+.asciz "GNU"
+.long 0xc0000001
+.long 16
+.quad 42 // platform
+.quad 1 // version
+
+#--- abi-tag2.s
+
+.section ".note.gnu.property", "a"
+.long 4
+.long 24
+.long 5
+.asciz "GNU"
+.long 0xc0000001
+.long 16
+.quad 42 // platform
+.quad 2 // version
+
+#--- no-info.s
+
+## Define _start to avoid a missing-entry warning; --fatal-warnings asserts that no diagnostic is emitted.
+## allow multiple definitions of _start for simplicity
+.weak _start;
+_start:
diff --git a/lld/test/ELF/aarch64-reloc-pauth-ro.s b/lld/test/ELF/aarch64-reloc-pauth-ro.s
new file mode 100644
index 000000000000..1be78ba4666e
--- /dev/null
+++ b/lld/test/ELF/aarch64-reloc-pauth-ro.s
@@ -0,0 +1,22 @@
+# REQUIRES: aarch64
+
+# RUN: llvm-mc -filetype=obj -triple=aarch64 %p/Inputs/shared2.s -o %t.so.o
+# RUN: ld.lld -shared %t.so.o -soname=so -o %t.so
+# RUN: llvm-mc -filetype=obj -triple=aarch64 %s -o %t.o
+# RUN: not ld.lld -pie %t.o %t.so -o %t2 2>&1 | FileCheck -DFILE=%t %s --implicit-check-not=error:
+
+# CHECK: error: relocation R_AARCH64_AUTH_ABS64 cannot be used against symbol 'zed2'; recompile with -fPIC
+# CHECK-NEXT: >>> defined in [[FILE]].so
+# CHECK-NEXT: >>> referenced by [[FILE]].o:(.ro+0x0)
+
+# CHECK: error: relocation R_AARCH64_AUTH_ABS64 cannot be used against symbol 'bar2'; recompile with -fPIC
+# CHECK: error: relocation R_AARCH64_AUTH_ABS64 cannot be used against local symbol; recompile with -fPIC
+
+foo:
+.type foo, @function
+
+.section .ro, "a"
+.p2align 3
+.quad zed2@AUTH(da,42)
+.quad bar2@AUTH(ia,42)
+.quad foo@AUTH(ia,42)
diff --git a/lld/test/ELF/aarch64-reloc-pauth.s b/lld/test/ELF/aarch64-reloc-pauth.s
new file mode 100644
index 000000000000..b603d8ffdcab
--- /dev/null
+++ b/lld/test/ELF/aarch64-reloc-pauth.s
@@ -0,0 +1,108 @@
+# REQUIRES: aarch64
+
+# RUN: llvm-mc -filetype=obj -triple=aarch64 %p/Inputs/shared2.s -o %t.a.o
+# RUN: ld.lld -shared %t.a.o -soname=so -o %t.a.so
+# RUN: llvm-mc -filetype=obj -triple=aarch64 %s -o %t.o
+
+# RUN: ld.lld -pie %t.o %t.a.so -o %t
+# RUN: llvm-readobj -r %t | FileCheck --check-prefix=UNPACKED %s
+
+# UNPACKED: Section ({{.+}}) .rela.dyn {
+# UNPACKED-NEXT: 0x30470 R_AARCH64_AUTH_RELATIVE - 0x1
+# UNPACKED-NEXT: 0x30478 R_AARCH64_AUTH_RELATIVE - 0x30472
+# UNPACKED-NEXT: 0x30480 R_AARCH64_AUTH_RELATIVE - 0xFFFFFFFFFFFFFFFD
+# UNPACKED-NEXT: 0x30488 R_AARCH64_AUTH_RELATIVE - 0x12345678
+# UNPACKED-NEXT: 0x30490 R_AARCH64_AUTH_RELATIVE - 0x123456789A
+# UNPACKED-NEXT: 0x30498 R_AARCH64_AUTH_RELATIVE - 0xFFFFFFEDCBA98766
+# UNPACKED-NEXT: 0x304A0 R_AARCH64_AUTH_RELATIVE - 0x8003046F
+# UNPACKED-NEXT: 0x304B9 R_AARCH64_AUTH_RELATIVE - 0x4
+# UNPACKED-NEXT: 0x304C2 R_AARCH64_AUTH_RELATIVE - 0x30475
+# UNPACKED-NEXT: 0x304A8 R_AARCH64_AUTH_ABS64 zed2 0x1111
+# UNPACKED-NEXT: 0x304B0 R_AARCH64_AUTH_ABS64 bar2 0x0
+# UNPACKED-NEXT: }
+
+# RUN: ld.lld %t.o %t.a.so -o %t.nopie
+# RUN: llvm-readobj -r %t.nopie | FileCheck --check-prefix=NOPIE %s
+
+# NOPIE: Section ({{.+}}) .rela.dyn {
+# NOPIE: 0x230460 R_AARCH64_AUTH_RELATIVE - 0x200001
+# NOPIE-NEXT: 0x230468 R_AARCH64_AUTH_RELATIVE - 0x230462
+# NOPIE-NEXT: 0x230470 R_AARCH64_AUTH_RELATIVE - 0x1FFFFD
+# NOPIE-NEXT: 0x230478 R_AARCH64_AUTH_RELATIVE - 0x12545678
+# NOPIE-NEXT: 0x230480 R_AARCH64_AUTH_RELATIVE - 0x123476789A
+# NOPIE-NEXT: 0x230488 R_AARCH64_AUTH_RELATIVE - 0xFFFFFFEDCBC98766
+# NOPIE-NEXT: 0x230490 R_AARCH64_AUTH_RELATIVE - 0x8023045F
+# NOPIE-NEXT: 0x2304A9 R_AARCH64_AUTH_RELATIVE - 0x200004
+# NOPIE-NEXT: 0x2304B2 R_AARCH64_AUTH_RELATIVE - 0x230465
+# NOPIE-NEXT: 0x230498 R_AARCH64_AUTH_ABS64 zed2 0x1111
+# NOPIE-NEXT: 0x2304A0 R_AARCH64_AUTH_ABS64 bar2 0x0
+# NOPIE-NEXT: }
+
+# RUN: ld.lld -pie %t.o %t.a.so -o %t.pie
+# RUN: llvm-readelf -S -d -r -x .test %t.pie | FileCheck --check-prefixes=PIE,HEX %s
+
+# PIE: Section Headers:
+# PIE-NEXT: Name Type Address Off Size ES Flg Lk Inf Al
+# PIE: .rela.dyn RELA {{0*}}[[#%x,ADDR1:]]
+# PIE-SAME: {{0*}}[[#ADDR1]] 000108 18 A 1 0 8
+
+# PIE: Relocation section '.rela.dyn' at offset 0x[[#ADDR1]] contains 11 entries:
+# PIE-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend
+# PIE-NEXT: 0000000000030470 0000000000000411 R_AARCH64_AUTH_RELATIVE 1
+# PIE-NEXT: 0000000000030478 0000000000000411 R_AARCH64_AUTH_RELATIVE 30472
+# PIE-NEXT: 0000000000030480 0000000000000411 R_AARCH64_AUTH_RELATIVE fffffffffffffffd
+# PIE-NEXT: 0000000000030488 0000000000000411 R_AARCH64_AUTH_RELATIVE 12345678
+# PIE-NEXT: 0000000000030490 0000000000000411 R_AARCH64_AUTH_RELATIVE 123456789a
+# PIE-NEXT: 0000000000030498 0000000000000411 R_AARCH64_AUTH_RELATIVE ffffffedcba98766
+# PIE-NEXT: 00000000000304a0 0000000000000411 R_AARCH64_AUTH_RELATIVE 8003046f
+# PIE-NEXT: 00000000000304b9 0000000000000411 R_AARCH64_AUTH_RELATIVE 4
+# PIE-NEXT: 00000000000304c2 0000000000000411 R_AARCH64_AUTH_RELATIVE 30475
+# PIE-NEXT: 00000000000304a8 0000000100000244 R_AARCH64_AUTH_ABS64 0000000000000000 zed2 + 1111
+# PIE-NEXT: 00000000000304b0 0000000200000244 R_AARCH64_AUTH_ABS64 0000000000000000 bar2 + 0
+
+# HEX: Hex dump of section '.test':
+# HEX-NEXT: 0x00030470 00000000 2a000020 00000000 2b000000
+## ^^^^ Discr = 42
+## ^^ Key (bits 5..6) = DA
+## ^^^^ Discr = 43
+## ^^ Key (bits 5..6) = IA
+# HEX-NEXT: 0x00030480 00000000 2c000080 00000000 2d000020
+## ^^^^ Discr = 44
+## ^^ Key (bits 5..6) = IA
+## ^^ Addr diversity (bit 7) = true
+## ^^^^ Discr = 45
+## ^^ Key (bits 5..6) = DA
+# HEX-NEXT: 0x00030490 00000000 2e000020 00000000 2f000020
+## ^^^^ Discr = 46
+## ^^ Key (bits 5..6) = DA
+## ^^^^ Discr = 47
+## ^^ Key (bits 5..6) = DA
+# HEX-NEXT: 0x000304a0 00000000 30000020 00000000 31000020
+## ^^^^ Discr = 48
+## ^^ Key (bits 5..6) = DA
+## ^^^^ Discr = 49
+## ^^ Key (bits 5..6) = DA
+# HEX-NEXT: 0x000304b0 00000000 32000000 77000000 00330000
+## ^^^^ Discr = 50
+## ^^ Key (bits 5..6) = IA
+## ^^^^ Discr = 51
+# HEX-NEXT: 0x000304c0 20770000 00003400 0020{{\ }}
+## ^^ Key (bits 5..6) = DA
+## ^^^^ Discr = 52
+## ^^ Key (bits 5..6) = DA
+
+.section .test, "aw"
+.p2align 3
+.quad (__ehdr_start + 1)@AUTH(da,42)
+.quad (.test + 2)@AUTH(ia,43)
+.quad (__ehdr_start - 3)@AUTH(ia,44,addr)
+.quad (__ehdr_start + 0x12345678)@AUTH(da,45)
+.quad (__ehdr_start + 0x123456789A)@AUTH(da,46)
+.quad (__ehdr_start - 0x123456789A)@AUTH(da,47)
+.quad (.test + 0x7FFFFFFF)@AUTH(da,48)
+.quad (zed2 + 0x1111)@AUTH(da,49)
+.quad bar2@AUTH(ia,50)
+.byte 0x77
+.quad (__ehdr_start + 4)@AUTH(da,51)
+.byte 0x77
+.quad (.test + 5)@AUTH(da,52)
diff --git a/lld/test/ELF/linkerscript/symbolreferenced.s b/lld/test/ELF/linkerscript/symbolreferenced.s
index 6f583d20e276..684808269083 100644
--- a/lld/test/ELF/linkerscript/symbolreferenced.s
+++ b/lld/test/ELF/linkerscript/symbolreferenced.s
@@ -50,6 +50,21 @@
# RUN: not ld.lld -T chain2.t a.o 2>&1 | FileCheck %s --check-prefix=ERR --implicit-check-not=error:
# ERR-COUNT-3: error: chain2.t:1: symbol not found: undef
+## _start in a lazy object file references PROVIDE symbols. We extract _start
+## earlier to avoid spurious "symbol not found" errors.
+# RUN: llvm-mc -filetype=obj -triple=x86_64 undef.s -o undef.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 start.s -o start.o
+# RUN: ld.lld -T chain2.t undef.o --start-lib start.o --end-lib -o lazy
+# RUN: llvm-nm lazy | FileCheck %s --check-prefix=LAZY
+# RUN: ld.lld -e 0 -T chain2.t --undefined-glob '_start*' undef.o --start-lib start.o --end-lib -o lazy
+# RUN: llvm-nm lazy | FileCheck %s --check-prefix=LAZY
+
+# LAZY: T _start
+# LAZY-NEXT: t f1
+# LAZY-NEXT: T f2
+# LAZY-NEXT: T newsym
# LAZY-NEXT: T undef
+
#--- a.s
.global _start
_start:
@@ -89,3 +104,13 @@ PROVIDE(newsym = f1);
PROVIDE(f2 = undef);
PROVIDE_HIDDEN(f1 = f2);
PROVIDE(newsym = f1);
+
+#--- undef.s
+.globl undef
+undef: ret
+
+#--- start.s
+.globl _start
+_start: ret
+.data
+.quad newsym
diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst
index 985d16eb11cf..57d6280d57c8 100644
--- a/llvm/docs/CommandGuide/llvm-objcopy.rst
+++ b/llvm/docs/CommandGuide/llvm-objcopy.rst
@@ -309,6 +309,14 @@ them.
Compress DWARF debug sections in the output, using the specified format.
Supported formats are ``zlib`` and ``zstd``. Use ``zlib`` if ``<format>`` is omitted.
+.. option:: --compress-sections <section>=<format>
+
+ Compress or decompress sections matched by ``<section>`` using the specified
+ format. Supported formats are ``zlib`` and ``zstd``. Specify ``none`` for
+ decompression. When a section is matched by multiple options, the last one
+ wins. A wildcard ``<section>`` starting with '!' is disallowed.
+ Sections within a segment cannot be (de)compressed.
+
.. option:: --decompress-debug-sections
Decompress any compressed DWARF debug sections in the output.
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 1d4ff5238226..774729c5f083 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -5557,6 +5557,8 @@ RISC-V:
Sparc:
+- ``L``: Print the low-order register of a two-register operand.
+- ``H``: Print the high-order register of a two-register operand.
- ``r``: No effect.
SystemZ:
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 7588048334d7..ff7fed9ad590 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -182,6 +182,10 @@ Changes to the LLVM tools
for ELF input to skip the specified symbols when executing other options
that can change a symbol's name, binding or visibility.
+* llvm-objcopy now supports ``--compress-sections`` to compress or decompress
+ arbitrary sections not within a segment.
+ (`#85036 <https://github.com/llvm/llvm-project/pull/85036>`_.)
+
* llvm-profgen now supports COFF+DWARF binaries. This enables Sample-based PGO
on Windows using Intel VTune's SEP. For details on usage, see the `end-user
documentation for SPGO
diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h
index b9b39f3b9dfb..8d3c029b2e7e 100644
--- a/llvm/include/llvm/ADT/APInt.h
+++ b/llvm/include/llvm/ADT/APInt.h
@@ -1740,8 +1740,8 @@ public:
return *this;
}
- /// \returns the multiplicative inverse for a given modulo.
- APInt multiplicativeInverse(const APInt &modulo) const;
+ /// \returns the multiplicative inverse of an odd APInt modulo 2^BitWidth.
+ APInt multiplicativeInverse() const;
/// @}
/// \name Building-block Operations for APInt and APFloat
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index bad0a77b0f2d..fa9392b86c15 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -190,7 +190,10 @@ enum class TailFoldingStyle {
/// Use predicate to control both data and control flow, but modify
/// the trip count so that a runtime overflow check can be avoided
/// and such that the scalar epilogue loop can always be removed.
- DataAndControlFlowWithoutRuntimeCheck
+ DataAndControlFlowWithoutRuntimeCheck,
+ /// Use predicated EVL instructions for tail-folding.
+ /// Indicates that VP intrinsics should be used.
+ DataWithEVL,
};
struct TailFoldingInfo {
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 877f3f7862c8..ed267c100b76 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -1712,11 +1712,6 @@ enum {
NT_ANDROID_TYPE_MEMTAG = 4,
};
-// ARM note types.
-enum {
- NT_ARM_TYPE_PAUTH_ABI_TAG = 1,
-};
-
// Memory tagging values used in NT_ANDROID_TYPE_MEMTAG notes.
enum {
// Enumeration to determine the tagging mode. In Android-land, 'SYNC' means
@@ -1740,6 +1735,7 @@ enum : unsigned {
GNU_PROPERTY_STACK_SIZE = 1,
GNU_PROPERTY_NO_COPY_ON_PROTECTED = 2,
GNU_PROPERTY_AARCH64_FEATURE_1_AND = 0xc0000000,
+ GNU_PROPERTY_AARCH64_FEATURE_PAUTH = 0xc0000001,
GNU_PROPERTY_X86_FEATURE_1_AND = 0xc0000002,
GNU_PROPERTY_X86_UINT32_OR_LO = 0xc0008000,
@@ -1758,6 +1754,26 @@ enum : unsigned {
GNU_PROPERTY_AARCH64_FEATURE_1_GCS = 1 << 2,
};
+// aarch64 PAuth platforms.
+enum : unsigned {
+ AARCH64_PAUTH_PLATFORM_INVALID = 0x0,
+ AARCH64_PAUTH_PLATFORM_BAREMETAL = 0x1,
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX = 0x10000002,
+};
+
+// Bit positions of version flags for AARCH64_PAUTH_PLATFORM_LLVM_LINUX.
+enum : unsigned {
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INTRINSICS = 0,
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_CALLS = 1,
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_RETURNS = 2,
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_AUTHTRAPS = 3,
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_VPTRADDRDISCR = 4,
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_VPTRTYPEDISCR = 5,
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INITFINI = 6,
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_LAST =
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INITFINI,
+};
+
// x86 processor feature bits.
enum : unsigned {
GNU_PROPERTY_X86_FEATURE_1_IBT = 1 << 0,
diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def
index 5fb3fa4aeb7b..cb05db85e2b5 100644
--- a/llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def
+++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def
@@ -1,18 +1,19 @@
-
#ifndef ELF_RELOC
#error "ELF_RELOC must be defined"
#endif
-// Based on ABI release 1.1-beta, dated 6 November 2013. NB: The cover page of
-// this document, IHI0056C_beta_aaelf64.pdf, on infocenter.arm.com, still
-// labels this as release 1.0.
+// Based on released ABI: https://github.com/ARM-software/abi-aa, aaelf64.
+// ELF64
+// Null relocation: also 0x100 for ELF64
ELF_RELOC(R_AARCH64_NONE, 0)
+// Data relocations
ELF_RELOC(R_AARCH64_ABS64, 0x101)
ELF_RELOC(R_AARCH64_ABS32, 0x102)
ELF_RELOC(R_AARCH64_ABS16, 0x103)
ELF_RELOC(R_AARCH64_PREL64, 0x104)
ELF_RELOC(R_AARCH64_PREL32, 0x105)
ELF_RELOC(R_AARCH64_PREL16, 0x106)
+// Static AArch64 relocations
ELF_RELOC(R_AARCH64_MOVW_UABS_G0, 0x107)
ELF_RELOC(R_AARCH64_MOVW_UABS_G0_NC, 0x108)
ELF_RELOC(R_AARCH64_MOVW_UABS_G1, 0x109)
@@ -60,11 +61,13 @@ ELF_RELOC(R_AARCH64_LD64_GOT_LO12_NC, 0x138)
ELF_RELOC(R_AARCH64_LD64_GOTPAGE_LO15, 0x139)
ELF_RELOC(R_AARCH64_PLT32, 0x13a)
ELF_RELOC(R_AARCH64_GOTPCREL32, 0x13b)
+// General dynamic TLS relocations
ELF_RELOC(R_AARCH64_TLSGD_ADR_PREL21, 0x200)
ELF_RELOC(R_AARCH64_TLSGD_ADR_PAGE21, 0x201)
ELF_RELOC(R_AARCH64_TLSGD_ADD_LO12_NC, 0x202)
ELF_RELOC(R_AARCH64_TLSGD_MOVW_G1, 0x203)
ELF_RELOC(R_AARCH64_TLSGD_MOVW_G0_NC, 0x204)
+// Local dynamic TLS relocations
ELF_RELOC(R_AARCH64_TLSLD_ADR_PREL21, 0x205)
ELF_RELOC(R_AARCH64_TLSLD_ADR_PAGE21, 0x206)
ELF_RELOC(R_AARCH64_TLSLD_ADD_LO12_NC, 0x207)
@@ -92,6 +95,7 @@ ELF_RELOC(R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC, 0x21c)
ELF_RELOC(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21, 0x21d)
ELF_RELOC(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC, 0x21e)
ELF_RELOC(R_AARCH64_TLSIE_LD_GOTTPREL_PREL19, 0x21f)
+// Local exec TLS relocations
ELF_RELOC(R_AARCH64_TLSLE_MOVW_TPREL_G2, 0x220)
ELF_RELOC(R_AARCH64_TLSLE_MOVW_TPREL_G1, 0x221)
ELF_RELOC(R_AARCH64_TLSLE_MOVW_TPREL_G1_NC, 0x222)
@@ -108,6 +112,7 @@ ELF_RELOC(R_AARCH64_TLSLE_LDST32_TPREL_LO12, 0x22c)
ELF_RELOC(R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC, 0x22d)
ELF_RELOC(R_AARCH64_TLSLE_LDST64_TPREL_LO12, 0x22e)
ELF_RELOC(R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC, 0x22f)
+// TLS descriptor relocations
ELF_RELOC(R_AARCH64_TLSDESC_LD_PREL19, 0x230)
ELF_RELOC(R_AARCH64_TLSDESC_ADR_PREL21, 0x231)
ELF_RELOC(R_AARCH64_TLSDESC_ADR_PAGE21, 0x232)
@@ -122,8 +127,7 @@ ELF_RELOC(R_AARCH64_TLSLE_LDST128_TPREL_LO12, 0x23a)
ELF_RELOC(R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC, 0x23b)
ELF_RELOC(R_AARCH64_TLSLD_LDST128_DTPREL_LO12, 0x23c)
ELF_RELOC(R_AARCH64_TLSLD_LDST128_DTPREL_LO12_NC, 0x23d)
-ELF_RELOC(R_AARCH64_AUTH_ABS64, 0x244)
-// Dynamic relocations start
+// Dynamic relocations
ELF_RELOC(R_AARCH64_COPY, 0x400)
ELF_RELOC(R_AARCH64_GLOB_DAT, 0x401)
ELF_RELOC(R_AARCH64_JUMP_SLOT, 0x402)
@@ -136,8 +140,12 @@ ELF_RELOC(R_AARCH64_TLS_DTPREL64, 0x405)
ELF_RELOC(R_AARCH64_TLS_TPREL64, 0x406)
ELF_RELOC(R_AARCH64_TLSDESC, 0x407)
ELF_RELOC(R_AARCH64_IRELATIVE, 0x408)
+// PAuthABI static and dynamic relocations: defined in pauthabielf64,
+// https://github.com/ARM-software/abi-aa
+ELF_RELOC(R_AARCH64_AUTH_ABS64, 0x244)
ELF_RELOC(R_AARCH64_AUTH_RELATIVE, 0x411)
+// ELF32
// ELF_RELOC(R_AARCH64_P32_NONE, 0)
ELF_RELOC(R_AARCH64_P32_ABS32, 0x001)
ELF_RELOC(R_AARCH64_P32_ABS16, 0x002)
@@ -216,7 +224,7 @@ ELF_RELOC(R_AARCH64_P32_TLSDESC_ADR_PAGE21, 0x07c)
ELF_RELOC(R_AARCH64_P32_TLSDESC_LD32_LO12, 0x07d)
ELF_RELOC(R_AARCH64_P32_TLSDESC_ADD_LO12, 0x07e)
ELF_RELOC(R_AARCH64_P32_TLSDESC_CALL, 0x07f)
-// Dynamic relocations start
+// Dynamic relocations
ELF_RELOC(R_AARCH64_P32_COPY, 0x0b4)
ELF_RELOC(R_AARCH64_P32_GLOB_DAT, 0x0b5)
ELF_RELOC(R_AARCH64_P32_JUMP_SLOT, 0x0b6)
diff --git a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h
index 152f781ffa9b..97c3e4d7589d 100644
--- a/llvm/include/llvm/IR/AutoUpgrade.h
+++ b/llvm/include/llvm/IR/AutoUpgrade.h
@@ -36,7 +36,8 @@ namespace llvm {
/// for upgrading, and returns true if it requires upgrading. It may return
/// null in NewFn if the all calls to the original intrinsic function
/// should be transformed to non-function-call instructions.
- bool UpgradeIntrinsicFunction(Function *F, Function *&NewFn);
+ bool UpgradeIntrinsicFunction(Function *F, Function *&NewFn,
+ bool CanUpgradeDebugIntrinsicsToRecords = true);
/// This is the complement to the above, replacing a specific call to an
/// intrinsic function with a call to the specified new function.
diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h
index c9477131c09c..9f4987493739 100644
--- a/llvm/include/llvm/IR/DebugProgramInstruction.h
+++ b/llvm/include/llvm/IR/DebugProgramInstruction.h
@@ -659,6 +659,25 @@ getDbgRecordRange(DbgMarker *DebugMarker) {
DEFINE_ISA_CONVERSION_FUNCTIONS(DbgRecord, LLVMDbgRecordRef)
+/// Used to temporarily set the debug info format of a function, module, or
+/// basic block for the duration of this object's lifetime, after which the
+/// prior state will be restored.
+template <typename T> class ScopedDbgInfoFormatSetter {
+ T &Obj;
+ bool OldState;
+
+public:
+ ScopedDbgInfoFormatSetter(T &Obj, bool NewState)
+ : Obj(Obj), OldState(Obj.IsNewDbgInfoFormat) {
+ Obj.setIsNewDbgInfoFormat(NewState);
+ }
+ ~ScopedDbgInfoFormatSetter() { Obj.setIsNewDbgInfoFormat(OldState); }
+};
+
+template <typename T>
+ScopedDbgInfoFormatSetter(T &Obj,
+ bool NewState) -> ScopedDbgInfoFormatSetter<T>;
+
} // namespace llvm
#endif // LLVM_IR_DEBUGPROGRAMINSTRUCTION_H
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index c04f4c526921..f0723a633f0f 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1733,8 +1733,7 @@ def int_ubsantrap : Intrinsic<[], [llvm_i8_ty],
// Return true if ubsan check is allowed.
def int_allow_ubsan_check : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i8_ty],
- [IntrInaccessibleMemOnly, IntrWriteMem, ImmArg<ArgIndex<0>>, NoUndef<RetIndex>]>,
- ClangBuiltin<"__builtin_allow_ubsan_check">;
+ [IntrInaccessibleMemOnly, IntrWriteMem, ImmArg<ArgIndex<0>>, NoUndef<RetIndex>]>;
// Return true if runtime check is allowed.
def int_allow_runtime_check : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_metadata_ty],
diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h
index 108465478d37..d701481202f8 100644
--- a/llvm/include/llvm/IR/PassManager.h
+++ b/llvm/include/llvm/IR/PassManager.h
@@ -64,23 +64,6 @@ extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
namespace llvm {
-// RemoveDIs: Provide facilities for converting debug-info from one form to
-// another, which are no-ops for everything but modules.
-template <class IRUnitT> inline bool shouldConvertDbgInfo(IRUnitT &IR) {
- return false;
-}
-template <> inline bool shouldConvertDbgInfo(Module &IR) {
- return !IR.IsNewDbgInfoFormat && UseNewDbgInfoFormat;
-}
-template <class IRUnitT> inline void doConvertDbgInfoToNew(IRUnitT &IR) {}
-template <> inline void doConvertDbgInfoToNew(Module &IR) {
- IR.convertToNewDbgValues();
-}
-template <class IRUnitT> inline void doConvertDebugInfoToOld(IRUnitT &IR) {}
-template <> inline void doConvertDebugInfoToOld(Module &IR) {
- IR.convertFromNewDbgValues();
-}
-
// Forward declare the analysis manager template.
template <typename IRUnitT, typename... ExtraArgTs> class AnalysisManager;
@@ -229,9 +212,7 @@ public:
// RemoveDIs: if requested, convert debug-info to DbgRecord representation
// for duration of these passes.
- bool ShouldConvertDbgInfo = shouldConvertDbgInfo(IR);
- if (ShouldConvertDbgInfo)
- doConvertDbgInfoToNew(IR);
+ ScopedDbgInfoFormatSetter FormatSetter(IR, UseNewDbgInfoFormat);
for (auto &Pass : Passes) {
// Check the PassInstrumentation's BeforePass callbacks before running the
@@ -255,9 +236,6 @@ public:
PA.intersect(std::move(PassPA));
}
- if (ShouldConvertDbgInfo)
- doConvertDebugInfoToOld(IR);
-
// Invalidation was handled after each pass in the above loop for the
// current unit of IR. Therefore, the remaining analysis results in the
// AnalysisManager are preserved. We mark this with a set so that we don't
diff --git a/llvm/include/llvm/IR/PrintPasses.h b/llvm/include/llvm/IR/PrintPasses.h
index 3803bd05cbe5..95b97e76c867 100644
--- a/llvm/include/llvm/IR/PrintPasses.h
+++ b/llvm/include/llvm/IR/PrintPasses.h
@@ -78,25 +78,6 @@ std::string doSystemDiff(StringRef Before, StringRef After,
StringRef OldLineFormat, StringRef NewLineFormat,
StringRef UnchangedLineFormat);
-/// Used to temporarily set the debug info format of a function, module, or
-/// basic block for the duration of this object's lifetime, after which the
-/// prior state will be restored.
-template <typename T> class ScopedDbgInfoFormatSetter {
- T &Obj;
- bool OldState;
-
-public:
- ScopedDbgInfoFormatSetter(T &Obj, bool NewState)
- : Obj(Obj), OldState(Obj.IsNewDbgInfoFormat) {
- Obj.setIsNewDbgInfoFormat(NewState);
- }
- ~ScopedDbgInfoFormatSetter() { Obj.setIsNewDbgInfoFormat(OldState); }
-};
-
-template <typename T>
-ScopedDbgInfoFormatSetter(T &Obj, bool NewState)
- -> ScopedDbgInfoFormatSetter<T>;
-
} // namespace llvm
#endif // LLVM_IR_PRINTPASSES_H
diff --git a/llvm/include/llvm/ObjCopy/CommonConfig.h b/llvm/include/llvm/ObjCopy/CommonConfig.h
index 9d6d5fb23b18..ae08d4032736 100644
--- a/llvm/include/llvm/ObjCopy/CommonConfig.h
+++ b/llvm/include/llvm/ObjCopy/CommonConfig.h
@@ -262,6 +262,9 @@ struct CommonConfig {
bool DecompressDebugSections = false;
DebugCompressionType CompressionType = DebugCompressionType::None;
+
+ SmallVector<std::pair<NameMatcher, llvm::DebugCompressionType>, 0>
+ compressSections;
};
} // namespace objcopy
diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h
index ff00900a1466..0431c182276e 100644
--- a/llvm/include/llvm/ProfileData/MemProf.h
+++ b/llvm/include/llvm/ProfileData/MemProf.h
@@ -22,6 +22,8 @@ enum IndexedVersion : uint64_t {
Version0 = 0,
// Version 1: Added a version field to the header.
Version1 = 1,
+ // Version 2: Added a call stack table. Under development.
+ Version2 = 2,
};
constexpr uint64_t MinimumSupportedVersion = Version0;
@@ -289,23 +291,14 @@ struct IndexedAllocationInfo {
: CallStack(CS.begin(), CS.end()), CSId(CSId), Info(MB) {}
// Returns the size in bytes when this allocation info struct is serialized.
- size_t serializedSize() const {
- return sizeof(uint64_t) + // The number of frames to serialize.
- sizeof(FrameId) * CallStack.size() + // The callstack frame ids.
- PortableMemInfoBlock::serializedSize(); // The size of the payload.
- }
+ size_t serializedSize(IndexedVersion Version) const;
bool operator==(const IndexedAllocationInfo &Other) const {
if (Other.Info != Info)
return false;
- if (Other.CallStack.size() != CallStack.size())
+ if (Other.CSId != CSId)
return false;
-
- for (size_t J = 0; J < Other.CallStack.size(); J++) {
- if (Other.CallStack[J] != CallStack[J])
- return false;
- }
return true;
}
@@ -357,6 +350,9 @@ struct IndexedMemProfRecord {
// inline location list may include additional entries, users should pick
// the last entry in the list with the same function GUID.
llvm::SmallVector<llvm::SmallVector<FrameId>> CallSites;
+ // Conceptually the same as above. We are going to keep both CallSites and
+ // CallSiteIds while we are transitioning from CallSites to CallSiteIds.
+ llvm::SmallVector<CallStackId> CallSiteIds;
void clear() {
AllocSites.clear();
@@ -370,47 +366,31 @@ struct IndexedMemProfRecord {
CallSites.append(Other.CallSites);
}
- size_t serializedSize() const {
- size_t Result = sizeof(GlobalValue::GUID);
- for (const IndexedAllocationInfo &N : AllocSites)
- Result += N.serializedSize();
-
- // The number of callsites we have information for.
- Result += sizeof(uint64_t);
- for (const auto &Frames : CallSites) {
- // The number of frame ids to serialize.
- Result += sizeof(uint64_t);
- Result += Frames.size() * sizeof(FrameId);
- }
- return Result;
- }
+ size_t serializedSize(IndexedVersion Version) const;
bool operator==(const IndexedMemProfRecord &Other) const {
if (Other.AllocSites.size() != AllocSites.size())
return false;
- if (Other.CallSites.size() != CallSites.size())
- return false;
-
for (size_t I = 0; I < AllocSites.size(); I++) {
if (AllocSites[I] != Other.AllocSites[I])
return false;
}
- for (size_t I = 0; I < CallSites.size(); I++) {
- if (CallSites[I] != Other.CallSites[I])
- return false;
- }
+ if (Other.CallSiteIds != CallSiteIds)
+ return false;
return true;
}
// Serializes the memprof records in \p Records to the ostream \p OS based
// on the schema provided in \p Schema.
- void serialize(const MemProfSchema &Schema, raw_ostream &OS);
+ void serialize(const MemProfSchema &Schema, raw_ostream &OS,
+ IndexedVersion Version);
// Deserializes memprof records from the Buffer.
static IndexedMemProfRecord deserialize(const MemProfSchema &Schema,
- const unsigned char *Buffer);
+ const unsigned char *Buffer,
+ IndexedVersion Version);
// Returns the GUID for the function name after canonicalization. For
// memprof, we remove any .llvm suffix added by LTO. MemProfRecords are
@@ -480,7 +460,8 @@ public:
using offset_type = uint64_t;
RecordLookupTrait() = delete;
- RecordLookupTrait(const MemProfSchema &S) : Schema(S) {}
+ RecordLookupTrait(IndexedVersion V, const MemProfSchema &S)
+ : Version(V), Schema(S) {}
static bool EqualKey(uint64_t A, uint64_t B) { return A == B; }
static uint64_t GetInternalKey(uint64_t K) { return K; }
@@ -507,11 +488,13 @@ public:
data_type ReadData(uint64_t K, const unsigned char *D,
offset_type /*Unused*/) {
- Record = IndexedMemProfRecord::deserialize(Schema, D);
+ Record = IndexedMemProfRecord::deserialize(Schema, D, Version);
return Record;
}
private:
+ // Holds the MemProf version.
+ IndexedVersion Version;
// Holds the memprof schema used to deserialize records.
MemProfSchema Schema;
// Holds the records from one function deserialized from the indexed format.
@@ -534,19 +517,23 @@ public:
// we must use a default constructor with no params for the writer trait so we
// have a public member which must be initialized by the user.
MemProfSchema *Schema = nullptr;
+ // The MemProf version to use for the serialization.
+ IndexedVersion Version;
- RecordWriterTrait() = default;
+ // We do not support the default constructor, which does not set Version.
+ RecordWriterTrait() = delete;
+ RecordWriterTrait(IndexedVersion V) : Version(V) {}
static hash_value_type ComputeHash(key_type_ref K) { return K; }
- static std::pair<offset_type, offset_type>
+ std::pair<offset_type, offset_type>
EmitKeyDataLength(raw_ostream &Out, key_type_ref K, data_type_ref V) {
using namespace support;
endian::Writer LE(Out, llvm::endianness::little);
offset_type N = sizeof(K);
LE.write<offset_type>(N);
- offset_type M = V.serializedSize();
+ offset_type M = V.serializedSize(Version);
LE.write<offset_type>(M);
return std::make_pair(N, M);
}
@@ -560,7 +547,7 @@ public:
void EmitData(raw_ostream &Out, key_type_ref /*Unused*/, data_type_ref V,
offset_type /*Unused*/) {
assert(Schema != nullptr && "MemProf schema is not initialized!");
- V.serialize(*Schema, Out);
+ V.serialize(*Schema, Out, Version);
// Clear the IndexedMemProfRecord which results in clearing/freeing its
// vectors of allocs and callsites. This is owned by the associated on-disk
// hash table, but unused after this point. See also the comment added to
diff --git a/llvm/include/llvm/TextAPI/InterfaceFile.h b/llvm/include/llvm/TextAPI/InterfaceFile.h
index 10a37e3a0c2f..23c27cb0f474 100644
--- a/llvm/include/llvm/TextAPI/InterfaceFile.h
+++ b/llvm/include/llvm/TextAPI/InterfaceFile.h
@@ -299,9 +299,9 @@ public:
}
/// Set the runpath search paths.
- /// \param InputTarget The target applicable to runpath search path.
/// \param RPath The name of runpath.
- void addRPath(const Target &InputTarget, StringRef RPath);
+ /// \param InputTarget The target applicable to runpath search path.
+ void addRPath(StringRef RPath, const Target &InputTarget);
/// Get the list of runpath search paths.
///
diff --git a/llvm/include/llvm/Transforms/Instrumentation/RemoveTrapsPass.h b/llvm/include/llvm/Transforms/Instrumentation/RemoveTrapsPass.h
index 58f6bbcec5dc..bae15840f992 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/RemoveTrapsPass.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/RemoveTrapsPass.h
@@ -25,6 +25,8 @@ namespace llvm {
class RemoveTrapsPass : public PassInfoMixin<RemoveTrapsPass> {
public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+ static bool IsRequested();
};
} // namespace llvm
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index b8bc81197c95..6cded828c25f 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -588,10 +588,14 @@ LazyValueInfoImpl::getBlockValue(Value *Val, BasicBlock *BB,
static ValueLatticeElement getFromRangeMetadata(Instruction *BBI) {
switch (BBI->getOpcode()) {
- default: break;
- case Instruction::Load:
+ default:
+ break;
case Instruction::Call:
case Instruction::Invoke:
+ if (std::optional<ConstantRange> Range = cast<CallBase>(BBI)->getRange())
+ return ValueLatticeElement::getRange(*Range);
+ [[fallthrough]];
+ case Instruction::Load:
if (MDNode *Ranges = BBI->getMetadata(LLVMContext::MD_range))
if (isa<IntegerType>(BBI->getType())) {
return ValueLatticeElement::getRange(
@@ -706,10 +710,11 @@ std::optional<ValueLatticeElement>
LazyValueInfoImpl::solveBlockValueNonLocal(Value *Val, BasicBlock *BB) {
ValueLatticeElement Result; // Start Undefined.
- // If this is the entry block, we must be asking about an argument. The
- // value is overdefined.
+ // If this is the entry block, we must be asking about an argument.
if (BB->isEntryBlock()) {
assert(isa<Argument>(Val) && "Unknown live-in to the entry block");
+ if (std::optional<ConstantRange> Range = cast<Argument>(Val)->getRange())
+ return ValueLatticeElement::getRange(*Range);
return ValueLatticeElement::getOverdefined();
}
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 515b9d0744f6..e030b9fc7dac 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -944,10 +944,7 @@ static const SCEV *BinomialCoefficient(const SCEV *It, unsigned K,
// Calculate the multiplicative inverse of K! / 2^T;
// this multiplication factor will perform the exact division by
// K! / 2^T.
- APInt Mod = APInt::getSignedMinValue(W+1);
- APInt MultiplyFactor = OddFactorial.zext(W+1);
- MultiplyFactor = MultiplyFactor.multiplicativeInverse(Mod);
- MultiplyFactor = MultiplyFactor.trunc(W);
+ APInt MultiplyFactor = OddFactorial.multiplicativeInverse();
// Calculate the product, at width T+W
IntegerType *CalculationTy = IntegerType::get(SE.getContext(),
@@ -10086,10 +10083,8 @@ static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const SCEV *B,
// If D == 1, (N / D) == N == 2^BW, so we need one extra bit to represent
// (N / D) in general. The inverse itself always fits into BW bits, though,
// so we immediately truncate it.
- APInt AD = A.lshr(Mult2).zext(BW + 1); // AD = A / D
- APInt Mod(BW + 1, 0);
- Mod.setBit(BW - Mult2); // Mod = N / D
- APInt I = AD.multiplicativeInverse(Mod).trunc(BW);
+ APInt AD = A.lshr(Mult2).trunc(BW - Mult2); // AD = A / D
+ APInt I = AD.multiplicativeInverse().zext(BW);
// 4. Compute the minimum unsigned root of the equation:
// I * (B / D) mod (N / D)
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index b5e8a1d22f26..5ad4da43bca7 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -648,6 +648,7 @@ static void computeKnownBitsFromCmp(const Value *V, CmpInst::Predicate Pred,
auto m_V =
m_CombineOr(m_Specific(V), m_PtrToIntSameSize(Q.DL, m_Specific(V)));
+ Value *Y;
const APInt *Mask, *C;
uint64_t ShAmt;
switch (Pred) {
@@ -656,16 +657,18 @@ static void computeKnownBitsFromCmp(const Value *V, CmpInst::Predicate Pred,
if (match(LHS, m_V) && match(RHS, m_APInt(C))) {
Known = Known.unionWith(KnownBits::makeConstant(*C));
// assume(V & Mask = C)
- } else if (match(LHS, m_And(m_V, m_APInt(Mask))) &&
+ } else if (match(LHS, m_c_And(m_V, m_Value(Y))) &&
match(RHS, m_APInt(C))) {
// For one bits in Mask, we can propagate bits from C to V.
- Known.Zero |= ~*C & *Mask;
- Known.One |= *C & *Mask;
+ Known.One |= *C;
+ if (match(Y, m_APInt(Mask)))
+ Known.Zero |= ~*C & *Mask;
// assume(V | Mask = C)
- } else if (match(LHS, m_Or(m_V, m_APInt(Mask))) && match(RHS, m_APInt(C))) {
+ } else if (match(LHS, m_c_Or(m_V, m_Value(Y))) && match(RHS, m_APInt(C))) {
// For zero bits in Mask, we can propagate bits from C to V.
- Known.Zero |= ~*C & ~*Mask;
- Known.One |= *C & ~*Mask;
+ Known.Zero |= ~*C;
+ if (match(Y, m_APInt(Mask)))
+ Known.One |= *C & ~*Mask;
// assume(V ^ Mask = C)
} else if (match(LHS, m_Xor(m_V, m_APInt(Mask))) &&
match(RHS, m_APInt(C))) {
@@ -8390,8 +8393,7 @@ bool llvm::matchSimpleRecurrence(const BinaryOperator *I, PHINode *&P,
/// Return true if "icmp Pred LHS RHS" is always true.
static bool isTruePredicate(CmpInst::Predicate Pred, const Value *LHS,
- const Value *RHS, const DataLayout &DL,
- unsigned Depth) {
+ const Value *RHS) {
if (ICmpInst::isTrueWhenEqual(Pred) && LHS == RHS)
return true;
@@ -8403,8 +8405,26 @@ static bool isTruePredicate(CmpInst::Predicate Pred, const Value *LHS,
const APInt *C;
// LHS s<= LHS +_{nsw} C if C >= 0
- if (match(RHS, m_NSWAdd(m_Specific(LHS), m_APInt(C))))
+ // LHS s<= LHS | C if C >= 0
+ if (match(RHS, m_NSWAdd(m_Specific(LHS), m_APInt(C))) ||
+ match(RHS, m_Or(m_Specific(LHS), m_APInt(C))))
return !C->isNegative();
+
+ // LHS s<= smax(LHS, V) for any V
+ if (match(RHS, m_c_SMax(m_Specific(LHS), m_Value())))
+ return true;
+
+ // smin(RHS, V) s<= RHS for any V
+ if (match(LHS, m_c_SMin(m_Specific(RHS), m_Value())))
+ return true;
+
+ // Match A to (X +_{nsw} CA) and B to (X +_{nsw} CB)
+ const Value *X;
+ const APInt *CLHS, *CRHS;
+ if (match(LHS, m_NSWAddLike(m_Value(X), m_APInt(CLHS))) &&
+ match(RHS, m_NSWAddLike(m_Specific(X), m_APInt(CRHS))))
+ return CLHS->sle(*CRHS);
+
return false;
}
@@ -8414,34 +8434,36 @@ static bool isTruePredicate(CmpInst::Predicate Pred, const Value *LHS,
cast<OverflowingBinaryOperator>(RHS)->hasNoUnsignedWrap())
return true;
+ // LHS u<= LHS | V for any V
+ if (match(RHS, m_c_Or(m_Specific(LHS), m_Value())))
+ return true;
+
+ // LHS u<= umax(LHS, V) for any V
+ if (match(RHS, m_c_UMax(m_Specific(LHS), m_Value())))
+ return true;
+
// RHS >> V u<= RHS for any V
if (match(LHS, m_LShr(m_Specific(RHS), m_Value())))
return true;
- // Match A to (X +_{nuw} CA) and B to (X +_{nuw} CB)
- auto MatchNUWAddsToSameValue = [&](const Value *A, const Value *B,
- const Value *&X,
- const APInt *&CA, const APInt *&CB) {
- if (match(A, m_NUWAdd(m_Value(X), m_APInt(CA))) &&
- match(B, m_NUWAdd(m_Specific(X), m_APInt(CB))))
- return true;
+ // RHS u/ C_ugt_1 u<= RHS
+ const APInt *C;
+ if (match(LHS, m_UDiv(m_Specific(RHS), m_APInt(C))) && C->ugt(1))
+ return true;
- // If X & C == 0 then (X | C) == X +_{nuw} C
- if (match(A, m_Or(m_Value(X), m_APInt(CA))) &&
- match(B, m_Or(m_Specific(X), m_APInt(CB)))) {
- KnownBits Known(CA->getBitWidth());
- computeKnownBits(X, Known, DL, Depth + 1, /*AC*/ nullptr,
- /*CxtI*/ nullptr, /*DT*/ nullptr);
- if (CA->isSubsetOf(Known.Zero) && CB->isSubsetOf(Known.Zero))
- return true;
- }
+ // RHS & V u<= RHS for any V
+ if (match(LHS, m_c_And(m_Specific(RHS), m_Value())))
+ return true;
- return false;
- };
+ // umin(RHS, V) u<= RHS for any V
+ if (match(LHS, m_c_UMin(m_Specific(RHS), m_Value())))
+ return true;
+ // Match A to (X +_{nuw} CA) and B to (X +_{nuw} CB)
const Value *X;
const APInt *CLHS, *CRHS;
- if (MatchNUWAddsToSameValue(LHS, RHS, X, CLHS, CRHS))
+ if (match(LHS, m_NUWAddLike(m_Value(X), m_APInt(CLHS))) &&
+ match(RHS, m_NUWAddLike(m_Specific(X), m_APInt(CRHS))))
return CLHS->ule(*CRHS);
return false;
@@ -8453,37 +8475,36 @@ static bool isTruePredicate(CmpInst::Predicate Pred, const Value *LHS,
/// ALHS ARHS" is true. Otherwise, return std::nullopt.
static std::optional<bool>
isImpliedCondOperands(CmpInst::Predicate Pred, const Value *ALHS,
- const Value *ARHS, const Value *BLHS, const Value *BRHS,
- const DataLayout &DL, unsigned Depth) {
+ const Value *ARHS, const Value *BLHS, const Value *BRHS) {
switch (Pred) {
default:
return std::nullopt;
case CmpInst::ICMP_SLT:
case CmpInst::ICMP_SLE:
- if (isTruePredicate(CmpInst::ICMP_SLE, BLHS, ALHS, DL, Depth) &&
- isTruePredicate(CmpInst::ICMP_SLE, ARHS, BRHS, DL, Depth))
+ if (isTruePredicate(CmpInst::ICMP_SLE, BLHS, ALHS) &&
+ isTruePredicate(CmpInst::ICMP_SLE, ARHS, BRHS))
return true;
return std::nullopt;
case CmpInst::ICMP_SGT:
case CmpInst::ICMP_SGE:
- if (isTruePredicate(CmpInst::ICMP_SLE, ALHS, BLHS, DL, Depth) &&
- isTruePredicate(CmpInst::ICMP_SLE, BRHS, ARHS, DL, Depth))
+ if (isTruePredicate(CmpInst::ICMP_SLE, ALHS, BLHS) &&
+ isTruePredicate(CmpInst::ICMP_SLE, BRHS, ARHS))
return true;
return std::nullopt;
case CmpInst::ICMP_ULT:
case CmpInst::ICMP_ULE:
- if (isTruePredicate(CmpInst::ICMP_ULE, BLHS, ALHS, DL, Depth) &&
- isTruePredicate(CmpInst::ICMP_ULE, ARHS, BRHS, DL, Depth))
+ if (isTruePredicate(CmpInst::ICMP_ULE, BLHS, ALHS) &&
+ isTruePredicate(CmpInst::ICMP_ULE, ARHS, BRHS))
return true;
return std::nullopt;
case CmpInst::ICMP_UGT:
case CmpInst::ICMP_UGE:
- if (isTruePredicate(CmpInst::ICMP_ULE, ALHS, BLHS, DL, Depth) &&
- isTruePredicate(CmpInst::ICMP_ULE, BRHS, ARHS, DL, Depth))
+ if (isTruePredicate(CmpInst::ICMP_ULE, ALHS, BLHS) &&
+ isTruePredicate(CmpInst::ICMP_ULE, BRHS, ARHS))
return true;
return std::nullopt;
}
@@ -8527,7 +8548,7 @@ static std::optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
CmpInst::Predicate RPred,
const Value *R0, const Value *R1,
const DataLayout &DL,
- bool LHSIsTrue, unsigned Depth) {
+ bool LHSIsTrue) {
Value *L0 = LHS->getOperand(0);
Value *L1 = LHS->getOperand(1);
@@ -8574,7 +8595,7 @@ static std::optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
return LPred == RPred;
if (LPred == RPred)
- return isImpliedCondOperands(LPred, L0, L1, R0, R1, DL, Depth);
+ return isImpliedCondOperands(LPred, L0, L1, R0, R1);
return std::nullopt;
}
@@ -8636,8 +8657,7 @@ llvm::isImpliedCondition(const Value *LHS, CmpInst::Predicate RHSPred,
// Both LHS and RHS are icmps.
const ICmpInst *LHSCmp = dyn_cast<ICmpInst>(LHS);
if (LHSCmp)
- return isImpliedCondICmps(LHSCmp, RHSPred, RHSOp0, RHSOp1, DL, LHSIsTrue,
- Depth);
+ return isImpliedCondICmps(LHSCmp, RHSPred, RHSOp0, RHSOp1, DL, LHSIsTrue);
/// The LHS should be an 'or', 'and', or a 'select' instruction. We expect
/// the RHS to be an icmp.
@@ -9276,11 +9296,17 @@ void llvm::findValuesAffectedByCondition(
if (ICmpInst::isEquality(Pred)) {
if (match(B, m_ConstantInt())) {
+ Value *Y;
// (X & C) or (X | C) or (X ^ C).
// (X << C) or (X >>_s C) or (X >>_u C).
if (match(A, m_BitwiseLogic(m_Value(X), m_ConstantInt())) ||
match(A, m_Shift(m_Value(X), m_ConstantInt())))
AddAffected(X);
+ else if (match(A, m_And(m_Value(X), m_Value(Y))) ||
+ match(A, m_Or(m_Value(X), m_Value(Y)))) {
+ AddAffected(X);
+ AddAffected(Y);
+ }
}
} else {
// Handle (A + C1) u< C2, which is the canonical form of
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
index de2396f31f66..4f2486c963e9 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
@@ -21,19 +21,14 @@ using namespace llvm;
extern bool WriteNewDbgInfoFormatToBitcode;
PreservedAnalyses BitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
- bool ConvertToOldDbgFormatForWrite =
- M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode;
- if (ConvertToOldDbgFormatForWrite)
- M.convertFromNewDbgValues();
+ ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat &&
+ WriteNewDbgInfoFormatToBitcode);
const ModuleSummaryIndex *Index =
EmitSummaryIndex ? &(AM.getResult<ModuleSummaryIndexAnalysis>(M))
: nullptr;
WriteBitcodeToFile(M, OS, ShouldPreserveUseListOrder, Index, EmitModuleHash);
- if (ConvertToOldDbgFormatForWrite)
- M.convertToNewDbgValues();
-
return PreservedAnalyses::all();
}
@@ -57,16 +52,12 @@ namespace {
StringRef getPassName() const override { return "Bitcode Writer"; }
bool runOnModule(Module &M) override {
- bool ConvertToOldDbgFormatForWrite =
- M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode;
- if (ConvertToOldDbgFormatForWrite)
- M.convertFromNewDbgValues();
+ ScopedDbgInfoFormatSetter FormatSetter(
+ M, M.IsNewDbgInfoFormat && WriteNewDbgInfoFormatToBitcode);
WriteBitcodeToFile(M, OS, ShouldPreserveUseListOrder, /*Index=*/nullptr,
/*EmitModuleHash=*/false);
- if (ConvertToOldDbgFormatForWrite)
- M.convertToNewDbgValues();
return false;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 062132c8304b..e53e35d98cec 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5201,10 +5201,7 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) {
// Calculate the multiplicative inverse modulo BW.
// 2^W requires W + 1 bits, so we have to extend and then truncate.
- unsigned W = Divisor.getBitWidth();
- APInt Factor = Divisor.zext(W + 1)
- .multiplicativeInverse(APInt::getSignedMinValue(W + 1))
- .trunc(W);
+ APInt Factor = Divisor.multiplicativeInverse();
Shifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0));
Factors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0));
return true;
@@ -6276,15 +6273,14 @@ bool CombinerHelper::matchShiftsTooBig(MachineInstr &MI) {
bool CombinerHelper::matchCommuteConstantToRHS(MachineInstr &MI) {
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
- if (!getIConstantVRegVal(LHS, MRI)) {
- // Skip commuting if LHS is not a constant. But, LHS may be a
- // G_CONSTANT_FOLD_BARRIER. If so we commute as long as we don't already
- // have a constant on the RHS.
- if (MRI.getVRegDef(LHS)->getOpcode() !=
- TargetOpcode::G_CONSTANT_FOLD_BARRIER)
- return false;
- }
- // Commute as long as RHS is not a constant or G_CONSTANT_FOLD_BARRIER.
+ auto *LHSDef = MRI.getVRegDef(LHS);
+ if (getIConstantVRegVal(LHS, MRI).has_value())
+ return true;
+
+ // LHS may be a G_CONSTANT_FOLD_BARRIER. If so we commute
+ // as long as we don't already have a constant on the RHS.
+ if (LHSDef->getOpcode() != TargetOpcode::G_CONSTANT_FOLD_BARRIER)
+ return false;
return MRI.getVRegDef(RHS)->getOpcode() !=
TargetOpcode::G_CONSTANT_FOLD_BARRIER &&
!getIConstantVRegVal(RHS, MRI);
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index bbc6d39d17fc..bf3aee67ec00 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -69,6 +69,8 @@ static cl::opt<bool> SimplifyMIR(
static cl::opt<bool> PrintLocations("mir-debug-loc", cl::Hidden, cl::init(true),
cl::desc("Print MIR debug-locations"));
+extern cl::opt<bool> WriteNewDbgInfoFormat;
+
namespace {
/// This structure describes how to print out stack object references.
@@ -986,29 +988,19 @@ void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V,
}
void llvm::printMIR(raw_ostream &OS, const Module &M) {
- // RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info
- // in dbg.value format.
- bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
- if (IsNewDbgInfoFormat)
- const_cast<Module &>(M).convertFromNewDbgValues();
+ ScopedDbgInfoFormatSetter FormatSetter(const_cast<Module &>(M),
+ WriteNewDbgInfoFormat);
yaml::Output Out(OS);
Out << const_cast<Module &>(M);
-
- if (IsNewDbgInfoFormat)
- const_cast<Module &>(M).convertToNewDbgValues();
}
void llvm::printMIR(raw_ostream &OS, const MachineFunction &MF) {
// RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info
// in dbg.value format.
- bool IsNewDbgInfoFormat = MF.getFunction().IsNewDbgInfoFormat;
- if (IsNewDbgInfoFormat)
- const_cast<Function &>(MF.getFunction()).convertFromNewDbgValues();
+ ScopedDbgInfoFormatSetter FormatSetter(
+ const_cast<Function &>(MF.getFunction()), WriteNewDbgInfoFormat);
MIRPrinter Printer(OS);
Printer.print(MF);
-
- if (IsNewDbgInfoFormat)
- const_cast<Function &>(MF.getFunction()).convertToNewDbgValues();
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 28fe0697357d..f20080c8f108 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1164,19 +1164,20 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
SDValue N01 = N0.getOperand(1);
if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N01))) {
+ SDNodeFlags NewFlags;
+ if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() &&
+ Flags.hasNoUnsignedWrap())
+ NewFlags.setNoUnsignedWrap(true);
+
if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) {
// Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, {N01, N1}))
- return DAG.getNode(Opc, DL, VT, N00, OpNode);
+ return DAG.getNode(Opc, DL, VT, N00, OpNode, NewFlags);
return SDValue();
}
if (TLI.isReassocProfitable(DAG, N0, N1)) {
// Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
// iff (op x, c1) has one use
- SDNodeFlags NewFlags;
- if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() &&
- Flags.hasNoUnsignedWrap())
- NewFlags.setNoUnsignedWrap(true);
SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1, NewFlags);
return DAG.getNode(Opc, DL, VT, OpNode, N01, NewFlags);
}
@@ -22265,12 +22266,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
IndexC->getAPIntValue().uge(VecVT.getVectorNumElements()))
return DAG.getUNDEF(ScalarVT);
- // extract_vector_elt(freeze(x)), idx -> freeze(extract_vector_elt(x)), idx
- if (VecOp.hasOneUse() && VecOp.getOpcode() == ISD::FREEZE) {
- return DAG.getFreeze(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT,
- VecOp.getOperand(0), Index));
- }
-
// extract_vector_elt (build_vector x, y), 1 -> y
if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) ||
VecOp.getOpcode() == ISD::SPLAT_VECTOR) &&
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 3332c02ec723..a8b1f41ee40d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -53,6 +53,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
SDValue R = SDValue();
switch (N->getOpcode()) {
+ // clang-format off
default:
#ifndef NDEBUG
dbgs() << "SoftenFloatResult #" << ResNo << ": ";
@@ -115,9 +116,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FPOWI:
case ISD::FLDEXP:
case ISD::STRICT_FLDEXP: R = SoftenFloatRes_ExpOp(N); break;
- case ISD::FFREXP:
- R = SoftenFloatRes_FFREXP(N);
- break;
+ case ISD::FFREXP: R = SoftenFloatRes_FFREXP(N); break;
case ISD::STRICT_FREM:
case ISD::FREM: R = SoftenFloatRes_FREM(N); break;
case ISD::STRICT_FRINT:
@@ -150,14 +149,11 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
case ISD::VECREDUCE_FMIN:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMAXIMUM:
- case ISD::VECREDUCE_FMINIMUM:
- R = SoftenFloatRes_VECREDUCE(N);
- break;
+ case ISD::VECREDUCE_FMINIMUM: R = SoftenFloatRes_VECREDUCE(N); break;
case ISD::VECREDUCE_SEQ_FADD:
- case ISD::VECREDUCE_SEQ_FMUL:
- R = SoftenFloatRes_VECREDUCE_SEQ(N);
- break;
- }
+ case ISD::VECREDUCE_SEQ_FMUL: R = SoftenFloatRes_VECREDUCE_SEQ(N); break;
+ // clang-format on
+ }
// If R is null, the sub-method took care of registering the result.
if (R.getNode()) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 8c543ae8038a..1dd0fa49a460 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5149,6 +5149,17 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::OR:
return ConsiderFlags && Op->getFlags().hasDisjoint();
+ case ISD::SCALAR_TO_VECTOR:
+ // Check if we demand any upper (undef) elements.
+ return !PoisonOnly && DemandedElts.ugt(1);
+
+ case ISD::EXTRACT_VECTOR_ELT: {
+ // Ensure that the element index is in bounds.
+ EVT VecVT = Op.getOperand(0).getValueType();
+ KnownBits KnownIdx = computeKnownBits(Op.getOperand(1), Depth + 1);
+ return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements());
+ }
+
case ISD::INSERT_VECTOR_ELT:{
// Ensure that the element index is in bounds.
EVT VecVT = Op.getOperand(0).getValueType();
@@ -11545,30 +11556,32 @@ bool llvm::isNeutralConstant(unsigned Opcode, SDNodeFlags Flags, SDValue V,
unsigned OperandNo) {
// NOTE: The cases should match with IR's ConstantExpr::getBinOpIdentity().
// TODO: Target-specific opcodes could be added.
- if (auto *Const = isConstOrConstSplat(V)) {
+ if (auto *ConstV = isConstOrConstSplat(V, /*AllowUndefs*/ false,
+ /*AllowTruncation*/ true)) {
+ APInt Const = ConstV->getAPIntValue().trunc(V.getScalarValueSizeInBits());
switch (Opcode) {
case ISD::ADD:
case ISD::OR:
case ISD::XOR:
case ISD::UMAX:
- return Const->isZero();
+ return Const.isZero();
case ISD::MUL:
- return Const->isOne();
+ return Const.isOne();
case ISD::AND:
case ISD::UMIN:
- return Const->isAllOnes();
+ return Const.isAllOnes();
case ISD::SMAX:
- return Const->isMinSignedValue();
+ return Const.isMinSignedValue();
case ISD::SMIN:
- return Const->isMaxSignedValue();
+ return Const.isMaxSignedValue();
case ISD::SUB:
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
- return OperandNo == 1 && Const->isZero();
+ return OperandNo == 1 && Const.isZero();
case ISD::UDIV:
case ISD::SDIV:
- return OperandNo == 1 && Const->isOne();
+ return OperandNo == 1 && Const.isOne();
}
} else if (auto *ConstFP = isConstOrConstSplatFP(V)) {
switch (Opcode) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 8bb9541bfe10..409d66adfd67 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -742,6 +742,13 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
break;
}
+ case ISD::FREEZE: {
+ SDValue N0 = Op.getOperand(0);
+ if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
+ /*PoisonOnly=*/false))
+ return N0;
+ break;
+ }
case ISD::AND: {
LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
@@ -3184,6 +3191,20 @@ bool TargetLowering::SimplifyDemandedVectorElts(
}
break;
}
+ case ISD::FREEZE: {
+ SDValue N0 = Op.getOperand(0);
+ if (TLO.DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts,
+ /*PoisonOnly=*/false))
+ return TLO.CombineTo(Op, N0);
+
+ // TODO: Replace this with the general fold from DAGCombiner::visitFREEZE
+ // freeze(op(x, ...)) -> op(freeze(x), ...).
+ if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR && DemandedElts == 1)
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
+ TLO.DAG.getFreeze(N0.getOperand(0))));
+ break;
+ }
case ISD::BUILD_VECTOR: {
// Check all elements and simplify any unused elements with UNDEF.
if (!DemandedElts.isAllOnes()) {
@@ -6050,11 +6071,7 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,
Divisor.ashrInPlace(Shift);
UseSRA = true;
}
- // Calculate the multiplicative inverse, using Newton's method.
- APInt t;
- APInt Factor = Divisor;
- while ((t = Divisor * Factor) != 1)
- Factor *= APInt(Divisor.getBitWidth(), 2) - t;
+ APInt Factor = Divisor.multiplicativeInverse();
Shifts.push_back(DAG.getConstant(Shift, dl, ShSVT));
Factors.push_back(DAG.getConstant(Factor, dl, SVT));
return true;
@@ -6643,10 +6660,7 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
// P = inv(D0, 2^W)
// 2^W requires W + 1 bits, so we have to extend and then truncate.
unsigned W = D.getBitWidth();
- APInt P = D0.zext(W + 1)
- .multiplicativeInverse(APInt::getSignedMinValue(W + 1))
- .trunc(W);
- assert(!P.isZero() && "No multiplicative inverse!"); // unreachable
+ APInt P = D0.multiplicativeInverse();
assert((D0 * P).isOne() && "Multiplicative inverse basic check failed.");
// Q = floor((2^W - 1) u/ D)
@@ -6901,10 +6915,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
// P = inv(D0, 2^W)
// 2^W requires W + 1 bits, so we have to extend and then truncate.
unsigned W = D.getBitWidth();
- APInt P = D0.zext(W + 1)
- .multiplicativeInverse(APInt::getSignedMinValue(W + 1))
- .trunc(W);
- assert(!P.isZero() && "No multiplicative inverse!"); // unreachable
+ APInt P = D0.multiplicativeInverse();
assert((D0 * P).isOne() && "Multiplicative inverse basic check failed.");
// A = floor((2^(W - 1) - 1) / D0) & -2^K
@@ -7630,7 +7641,7 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
//
// For division, we can compute the remainder using the algorithm described
// above, subtract it from the dividend to get an exact multiple of Constant.
-// Then multiply that extact multiply by the multiplicative inverse modulo
+// Then multiply that exact multiple by the multiplicative inverse modulo
// (1 << (BitWidth / 2)) to get the quotient.
// If Constant is even, we can shift right the dividend and the divisor by the
@@ -7765,10 +7776,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
// Multiply by the multiplicative inverse of the divisor modulo
// (1 << BitWidth).
- APInt Mod = APInt::getSignedMinValue(BitWidth + 1);
- APInt MulFactor = Divisor.zext(BitWidth + 1);
- MulFactor = MulFactor.multiplicativeInverse(Mod);
- MulFactor = MulFactor.trunc(BitWidth);
+ APInt MulFactor = Divisor.multiplicativeInverse();
SDValue Quotient = DAG.getNode(ISD::MUL, dl, VT, Dividend,
DAG.getConstant(MulFactor, dl, VT));
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index a44f6af4162f..0f8c984d5e3c 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -983,7 +983,8 @@ static Intrinsic::ID shouldUpgradeNVPTXBF16Intrinsic(StringRef Name) {
return Intrinsic::not_intrinsic;
}
-static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
+static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
+ bool CanUpgradeDebugIntrinsicsToRecords) {
assert(F && "Illegal to upgrade a non-existent Function.");
StringRef Name = F->getName();
@@ -1057,7 +1058,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
case 'd':
if (Name.consume_front("dbg.")) {
// Mark debug intrinsics for upgrade to new debug format.
- if (F->getParent()->IsNewDbgInfoFormat) {
+ if (CanUpgradeDebugIntrinsicsToRecords &&
+ F->getParent()->IsNewDbgInfoFormat) {
if (Name == "addr" || Name == "value" || Name == "assign" ||
Name == "declare" || Name == "label") {
// There's no function to replace these with.
@@ -1413,9 +1415,11 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
return false;
}
-bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn) {
+bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn,
+ bool CanUpgradeDebugIntrinsicsToRecords) {
NewFn = nullptr;
- bool Upgraded = upgradeIntrinsicFunction1(F, NewFn);
+ bool Upgraded =
+ upgradeIntrinsicFunction1(F, NewFn, CanUpgradeDebugIntrinsicsToRecords);
assert(F != NewFn && "Intrinsic function upgraded to the same function");
// Upgrade intrinsic attributes. This does not change the function.
@@ -2412,6 +2416,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
Builder.SetInsertPoint(CI->getParent(), CI->getIterator());
if (!NewFn) {
+ bool FallthroughToDefaultUpgrade = false;
// Get the Function's name.
StringRef Name = F->getName();
@@ -4262,16 +4267,30 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
Rep = upgradeARMIntrinsicCall(Name, CI, F, Builder);
} else if (IsAMDGCN) {
Rep = upgradeAMDGCNIntrinsicCall(Name, CI, F, Builder);
- } else if (IsDbg && CI->getModule()->IsNewDbgInfoFormat) {
- upgradeDbgIntrinsicToDbgRecord(Name, CI);
+ } else if (IsDbg) {
+ // We might have decided we don't want the new format after all between
+ // first requesting the upgrade and now; skip the conversion if that is
+ // the case, and check here to see if the intrinsic needs to be upgraded
+ // normally.
+ if (!CI->getModule()->IsNewDbgInfoFormat) {
+ bool NeedsUpgrade =
+ upgradeIntrinsicFunction1(CI->getCalledFunction(), NewFn, false);
+ if (!NeedsUpgrade)
+ return;
+ FallthroughToDefaultUpgrade = true;
+ } else {
+ upgradeDbgIntrinsicToDbgRecord(Name, CI);
+ }
} else {
llvm_unreachable("Unknown function for CallBase upgrade.");
}
- if (Rep)
- CI->replaceAllUsesWith(Rep);
- CI->eraseFromParent();
- return;
+ if (!FallthroughToDefaultUpgrade) {
+ if (Rep)
+ CI->replaceAllUsesWith(Rep);
+ CI->eraseFromParent();
+ return;
+ }
}
const auto &DefaultCase = [&]() -> void {
diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp
index 953f21ce7405..d361bd9a9839 100644
--- a/llvm/lib/IR/LegacyPassManager.cpp
+++ b/llvm/lib/IR/LegacyPassManager.cpp
@@ -531,9 +531,7 @@ bool PassManagerImpl::run(Module &M) {
// RemoveDIs: if a command line flag is given, convert to the
// DbgVariableRecord representation of debug-info for the duration of these
// passes.
- bool shouldConvertDbgInfo = UseNewDbgInfoFormat && !M.IsNewDbgInfoFormat;
- if (shouldConvertDbgInfo)
- M.convertToNewDbgValues();
+ ScopedDbgInfoFormatSetter FormatSetter(M, UseNewDbgInfoFormat);
for (ImmutablePass *ImPass : getImmutablePasses())
Changed |= ImPass->doInitialization(M);
@@ -547,9 +545,6 @@ bool PassManagerImpl::run(Module &M) {
for (ImmutablePass *ImPass : getImmutablePasses())
Changed |= ImPass->doFinalization(M);
- if (shouldConvertDbgInfo)
- M.convertFromNewDbgValues();
-
return Changed;
}
} // namespace legacy
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 33f358440a31..64c59914cf2f 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -1734,8 +1734,28 @@ void Verifier::visitModuleFlags() {
// Scan each flag, and track the flags and requirements.
DenseMap<const MDString*, const MDNode*> SeenIDs;
SmallVector<const MDNode*, 16> Requirements;
- for (const MDNode *MDN : Flags->operands())
+ uint64_t PAuthABIPlatform = -1;
+ uint64_t PAuthABIVersion = -1;
+ for (const MDNode *MDN : Flags->operands()) {
visitModuleFlag(MDN, SeenIDs, Requirements);
+ if (MDN->getNumOperands() != 3)
+ continue;
+ if (const auto *FlagName = dyn_cast_or_null<MDString>(MDN->getOperand(1))) {
+ if (FlagName->getString() == "aarch64-elf-pauthabi-platform") {
+ if (const auto *PAP =
+ mdconst::dyn_extract_or_null<ConstantInt>(MDN->getOperand(2)))
+ PAuthABIPlatform = PAP->getZExtValue();
+ } else if (FlagName->getString() == "aarch64-elf-pauthabi-version") {
+ if (const auto *PAV =
+ mdconst::dyn_extract_or_null<ConstantInt>(MDN->getOperand(2)))
+ PAuthABIVersion = PAV->getZExtValue();
+ }
+ }
+ }
+
+ if ((PAuthABIPlatform == uint64_t(-1)) != (PAuthABIVersion == uint64_t(-1)))
+ CheckFailed("either both or no 'aarch64-elf-pauthabi-platform' and "
+ "'aarch64-elf-pauthabi-version' module flags must be present");
// Validate that the requirements in the module are valid.
for (const MDNode *Requirement : Requirements) {
@@ -4343,6 +4363,11 @@ void Verifier::visitEHPadPredecessors(Instruction &I) {
if (auto *II = dyn_cast<InvokeInst>(TI)) {
Check(II->getUnwindDest() == BB && II->getNormalDest() != BB,
"EH pad must be jumped to via an unwind edge", ToPad, II);
+ auto *CalledFn =
+ dyn_cast<Function>(II->getCalledOperand()->stripPointerCasts());
+ if (CalledFn && CalledFn->isIntrinsic() && II->doesNotThrow() &&
+ !IntrinsicInst::mayLowerToFunctionCall(CalledFn->getIntrinsicID()))
+ continue;
if (auto Bundle = II->getOperandBundle(LLVMContext::OB_funclet))
FromPad = Bundle->Inputs[0];
else
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index a7e6db82e5c2..7a5aa0c80478 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -1548,25 +1548,10 @@ Error IRLinker::run() {
return Err;
// Convert source module to match dest for the duration of the link.
- bool SrcModuleNewDbgFormat = SrcM->IsNewDbgInfoFormat;
- if (DstM.IsNewDbgInfoFormat != SrcM->IsNewDbgInfoFormat) {
- if (DstM.IsNewDbgInfoFormat)
- SrcM->convertToNewDbgValues();
- else
- SrcM->convertFromNewDbgValues();
- }
- // Undo debug mode conversion afterwards.
- auto Cleanup = make_scope_exit([&]() {
- if (SrcModuleNewDbgFormat != SrcM->IsNewDbgInfoFormat) {
- if (SrcModuleNewDbgFormat)
- SrcM->convertToNewDbgValues();
- else
- SrcM->convertFromNewDbgValues();
- }
- });
+ ScopedDbgInfoFormatSetter FormatSetter(*SrcM, DstM.IsNewDbgInfoFormat);
- // Inherit the target data from the source module if the destination module
- // doesn't have one already.
+ // Inherit the target data from the source module if the destination
+ // module doesn't have one already.
if (DstM.getDataLayout().isDefault())
DstM.setDataLayout(SrcM->getDataLayout());
diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
index 205bc1ef5b1a..f343d1447e05 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -215,23 +215,41 @@ static Error dumpSectionToFile(StringRef SecName, StringRef Filename,
}
Error Object::compressOrDecompressSections(const CommonConfig &Config) {
- // Build a list of the debug sections we are going to replace.
- // We can't call `AddSection` while iterating over sections,
+ // Build a list of sections we are going to replace.
+ // We can't call `addSection` while iterating over sections,
// because it would mutate the sections array.
SmallVector<std::pair<SectionBase *, std::function<SectionBase *()>>, 0>
ToReplace;
for (SectionBase &Sec : sections()) {
- if ((Sec.Flags & SHF_ALLOC) || !StringRef(Sec.Name).starts_with(".debug"))
+ std::optional<DebugCompressionType> CType;
+ for (auto &[Matcher, T] : Config.compressSections)
+ if (Matcher.matches(Sec.Name))
+ CType = T;
+ // Handle --compress-debug-sections and --decompress-debug-sections, which
+ // apply to non-ALLOC debug sections.
+ if (!(Sec.Flags & SHF_ALLOC) && StringRef(Sec.Name).starts_with(".debug")) {
+ if (Config.CompressionType != DebugCompressionType::None)
+ CType = Config.CompressionType;
+ else if (Config.DecompressDebugSections)
+ CType = DebugCompressionType::None;
+ }
+ if (!CType)
continue;
+
+ if (Sec.ParentSegment)
+ return createStringError(
+ errc::invalid_argument,
+ "section '" + Sec.Name +
+ "' within a segment cannot be (de)compressed");
+
if (auto *CS = dyn_cast<CompressedSection>(&Sec)) {
- if (Config.DecompressDebugSections) {
+ if (*CType == DebugCompressionType::None)
ToReplace.emplace_back(
&Sec, [=] { return &addSection<DecompressedSection>(*CS); });
- }
- } else if (Config.CompressionType != DebugCompressionType::None) {
- ToReplace.emplace_back(&Sec, [&, S = &Sec] {
+ } else if (*CType != DebugCompressionType::None) {
+ ToReplace.emplace_back(&Sec, [=, S = &Sec] {
return &addSection<CompressedSection>(
- CompressedSection(*S, Config.CompressionType, Is64Bits));
+ CompressedSection(*S, *CType, Is64Bits));
});
}
}
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 7ac5c561dc08..884334ed070e 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -1303,7 +1303,7 @@ Error IndexedInstrProfReader::readHeader() {
MemProfRecordTable.reset(MemProfRecordHashTable::Create(
/*Buckets=*/Start + RecordTableOffset,
/*Payload=*/Ptr,
- /*Base=*/Start, memprof::RecordLookupTrait(Schema)));
+ /*Base=*/Start, memprof::RecordLookupTrait(memprof::Version1, Schema)));
// Initialize the frame table reader with the payload and bucket offsets.
MemProfFrameTable.reset(MemProfFrameHashTable::Create(
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index c2c94ba30c65..72d77d5580a5 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -414,6 +414,144 @@ static void setSummary(IndexedInstrProf::Summary *TheSummary,
TheSummary->setEntry(I, Res[I]);
}
+// Serialize Schema.
+static void writeMemProfSchema(ProfOStream &OS,
+ const memprof::MemProfSchema &Schema) {
+ OS.write(static_cast<uint64_t>(Schema.size()));
+ for (const auto Id : Schema)
+ OS.write(static_cast<uint64_t>(Id));
+}
+
+// Serialize MemProfRecordData. Return RecordTableOffset.
+static uint64_t writeMemProfRecords(
+ ProfOStream &OS,
+ llvm::MapVector<GlobalValue::GUID, memprof::IndexedMemProfRecord>
+ &MemProfRecordData,
+ memprof::MemProfSchema *Schema) {
+ auto RecordWriter =
+ std::make_unique<memprof::RecordWriterTrait>(memprof::Version1);
+ RecordWriter->Schema = Schema;
+ OnDiskChainedHashTableGenerator<memprof::RecordWriterTrait>
+ RecordTableGenerator;
+ for (auto &I : MemProfRecordData) {
+ // Insert the key (func hash) and value (memprof record).
+ RecordTableGenerator.insert(I.first, I.second, *RecordWriter.get());
+ }
+ // Release the memory of this MapVector as it is no longer needed.
+ MemProfRecordData.clear();
+
+ // The call to Emit invokes RecordWriterTrait::EmitData which destructs
+ // the memprof record copies owned by the RecordTableGenerator. This works
+ // because the RecordTableGenerator is not used after this point.
+ return RecordTableGenerator.Emit(OS.OS, *RecordWriter);
+}
+
+// Serialize MemProfFrameData. Return FrameTableOffset.
+static uint64_t writeMemProfFrames(
+ ProfOStream &OS,
+ llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData) {
+ auto FrameWriter = std::make_unique<memprof::FrameWriterTrait>();
+ OnDiskChainedHashTableGenerator<memprof::FrameWriterTrait>
+ FrameTableGenerator;
+ for (auto &I : MemProfFrameData) {
+ // Insert the key (frame id) and value (frame contents).
+ FrameTableGenerator.insert(I.first, I.second);
+ }
+ // Release the memory of this MapVector as it is no longer needed.
+ MemProfFrameData.clear();
+
+ return FrameTableGenerator.Emit(OS.OS, *FrameWriter);
+}
+
+static Error writeMemProfV0(
+ ProfOStream &OS,
+ llvm::MapVector<GlobalValue::GUID, memprof::IndexedMemProfRecord>
+ &MemProfRecordData,
+ llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData) {
+ uint64_t HeaderUpdatePos = OS.tell();
+ OS.write(0ULL); // Reserve space for the memprof record table offset.
+ OS.write(0ULL); // Reserve space for the memprof frame payload offset.
+ OS.write(0ULL); // Reserve space for the memprof frame table offset.
+
+ auto Schema = memprof::PortableMemInfoBlock::getSchema();
+ writeMemProfSchema(OS, Schema);
+
+ uint64_t RecordTableOffset =
+ writeMemProfRecords(OS, MemProfRecordData, &Schema);
+
+ uint64_t FramePayloadOffset = OS.tell();
+ uint64_t FrameTableOffset = writeMemProfFrames(OS, MemProfFrameData);
+
+ uint64_t Header[] = {RecordTableOffset, FramePayloadOffset, FrameTableOffset};
+ OS.patch({{HeaderUpdatePos, Header, std::size(Header)}});
+
+ return Error::success();
+}
+
+static Error writeMemProfV1(
+ ProfOStream &OS,
+ llvm::MapVector<GlobalValue::GUID, memprof::IndexedMemProfRecord>
+ &MemProfRecordData,
+ llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData) {
+ OS.write(memprof::Version0);
+ uint64_t HeaderUpdatePos = OS.tell();
+ OS.write(0ULL); // Reserve space for the memprof record table offset.
+ OS.write(0ULL); // Reserve space for the memprof frame payload offset.
+ OS.write(0ULL); // Reserve space for the memprof frame table offset.
+
+ auto Schema = memprof::PortableMemInfoBlock::getSchema();
+ writeMemProfSchema(OS, Schema);
+
+ uint64_t RecordTableOffset =
+ writeMemProfRecords(OS, MemProfRecordData, &Schema);
+
+ uint64_t FramePayloadOffset = OS.tell();
+ uint64_t FrameTableOffset = writeMemProfFrames(OS, MemProfFrameData);
+
+ uint64_t Header[] = {RecordTableOffset, FramePayloadOffset, FrameTableOffset};
+ OS.patch({{HeaderUpdatePos, Header, std::size(Header)}});
+
+ return Error::success();
+}
+
+// The MemProf profile data includes a simple schema
+// with the format described below followed by the hashtable:
+// uint64_t Version
+// uint64_t RecordTableOffset = RecordTableGenerator.Emit
+// uint64_t FramePayloadOffset = Stream offset before emitting the frame table
+// uint64_t FrameTableOffset = FrameTableGenerator.Emit
+// uint64_t Num schema entries
+// uint64_t Schema entry 0
+// uint64_t Schema entry 1
+// ....
+// uint64_t Schema entry N - 1
+// OnDiskChainedHashTable MemProfRecordData
+// OnDiskChainedHashTable MemProfFrameData
+static Error writeMemProf(
+ ProfOStream &OS,
+ llvm::MapVector<GlobalValue::GUID, memprof::IndexedMemProfRecord>
+ &MemProfRecordData,
+ llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData,
+ memprof::IndexedVersion MemProfVersionRequested) {
+
+ switch (MemProfVersionRequested) {
+ case memprof::Version0:
+ return writeMemProfV0(OS, MemProfRecordData, MemProfFrameData);
+ case memprof::Version1:
+ return writeMemProfV1(OS, MemProfRecordData, MemProfFrameData);
+ case memprof::Version2:
+ // TODO: Implement. Fall through to the error handling below for now.
+ break;
+ }
+
+ return make_error<InstrProfError>(
+ instrprof_error::unsupported_version,
+ formatv("MemProf version {} not supported; "
+ "requires version between {} and {}, inclusive",
+ MemProfVersionRequested, memprof::MinimumSupportedVersion,
+ memprof::MaximumSupportedVersion));
+}
+
Error InstrProfWriter::writeImpl(ProfOStream &OS) {
using namespace IndexedInstrProf;
using namespace support;
@@ -517,84 +655,13 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
// Write the hash table.
uint64_t HashTableStart = Generator.Emit(OS.OS, *InfoObj);
- // Write the MemProf profile data if we have it. This includes a simple schema
- // with the format described below followed by the hashtable:
- // uint64_t Version
- // uint64_t RecordTableOffset = RecordTableGenerator.Emit
- // uint64_t FramePayloadOffset = Stream offset before emitting the frame table
- // uint64_t FrameTableOffset = FrameTableGenerator.Emit
- // uint64_t Num schema entries
- // uint64_t Schema entry 0
- // uint64_t Schema entry 1
- // ....
- // uint64_t Schema entry N - 1
- // OnDiskChainedHashTable MemProfRecordData
- // OnDiskChainedHashTable MemProfFrameData
+ // Write the MemProf profile data if we have it.
uint64_t MemProfSectionStart = 0;
if (static_cast<bool>(ProfileKind & InstrProfKind::MemProf)) {
- if (MemProfVersionRequested < memprof::MinimumSupportedVersion ||
- MemProfVersionRequested > memprof::MaximumSupportedVersion) {
- return make_error<InstrProfError>(
- instrprof_error::unsupported_version,
- formatv("MemProf version {} not supported; "
- "requires version between {} and {}, inclusive",
- MemProfVersionRequested, memprof::MinimumSupportedVersion,
- memprof::MaximumSupportedVersion));
- }
-
MemProfSectionStart = OS.tell();
-
- if (MemProfVersionRequested >= memprof::Version1)
- OS.write(MemProfVersionRequested);
-
- OS.write(0ULL); // Reserve space for the memprof record table offset.
- OS.write(0ULL); // Reserve space for the memprof frame payload offset.
- OS.write(0ULL); // Reserve space for the memprof frame table offset.
-
- auto Schema = memprof::PortableMemInfoBlock::getSchema();
- OS.write(static_cast<uint64_t>(Schema.size()));
- for (const auto Id : Schema) {
- OS.write(static_cast<uint64_t>(Id));
- }
-
- auto RecordWriter = std::make_unique<memprof::RecordWriterTrait>();
- RecordWriter->Schema = &Schema;
- OnDiskChainedHashTableGenerator<memprof::RecordWriterTrait>
- RecordTableGenerator;
- for (auto &I : MemProfRecordData) {
- // Insert the key (func hash) and value (memprof record).
- RecordTableGenerator.insert(I.first, I.second);
- }
- // Release the memory of this MapVector as it is no longer needed.
- MemProfRecordData.clear();
-
- // The call to Emit invokes RecordWriterTrait::EmitData which destructs
- // the memprof record copies owned by the RecordTableGenerator. This works
- // because the RecordTableGenerator is not used after this point.
- uint64_t RecordTableOffset =
- RecordTableGenerator.Emit(OS.OS, *RecordWriter);
-
- uint64_t FramePayloadOffset = OS.tell();
-
- auto FrameWriter = std::make_unique<memprof::FrameWriterTrait>();
- OnDiskChainedHashTableGenerator<memprof::FrameWriterTrait>
- FrameTableGenerator;
- for (auto &I : MemProfFrameData) {
- // Insert the key (frame id) and value (frame contents).
- FrameTableGenerator.insert(I.first, I.second);
- }
- // Release the memory of this MapVector as it is no longer needed.
- MemProfFrameData.clear();
-
- uint64_t FrameTableOffset = FrameTableGenerator.Emit(OS.OS, *FrameWriter);
-
- uint64_t Header[] = {RecordTableOffset, FramePayloadOffset,
- FrameTableOffset};
- uint64_t HeaderUpdatePos = MemProfSectionStart;
- if (MemProfVersionRequested >= memprof::Version1)
- // The updates go just after the version field.
- HeaderUpdatePos += sizeof(uint64_t);
- OS.patch({{HeaderUpdatePos, Header, std::size(Header)}});
+ if (auto E = writeMemProf(OS, MemProfRecordData, MemProfFrameData,
+ MemProfVersionRequested))
+ return E;
}
// BinaryIdSection has two parts:
diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp
index 6c419811d59e..ac0a8702c3f9 100644
--- a/llvm/lib/ProfileData/MemProf.cpp
+++ b/llvm/lib/ProfileData/MemProf.cpp
@@ -10,15 +10,88 @@
namespace llvm {
namespace memprof {
+namespace {
+size_t serializedSizeV0(const IndexedAllocationInfo &IAI) {
+ size_t Size = 0;
+ // The number of frames to serialize.
+ Size += sizeof(uint64_t);
+ // The callstack frame ids.
+ Size += sizeof(FrameId) * IAI.CallStack.size();
+ // The size of the payload.
+ Size += PortableMemInfoBlock::serializedSize();
+ return Size;
+}
-void IndexedMemProfRecord::serialize(const MemProfSchema &Schema,
- raw_ostream &OS) {
+size_t serializedSizeV2(const IndexedAllocationInfo &IAI) {
+ size_t Size = 0;
+ // The CallStackId
+ Size += sizeof(CallStackId);
+ // The size of the payload.
+ Size += PortableMemInfoBlock::serializedSize();
+ return Size;
+}
+} // namespace
+
+size_t IndexedAllocationInfo::serializedSize(IndexedVersion Version) const {
+ switch (Version) {
+ case Version0:
+ case Version1:
+ return serializedSizeV0(*this);
+ case Version2:
+ return serializedSizeV2(*this);
+ }
+ llvm_unreachable("unsupported MemProf version");
+}
+
+namespace {
+size_t serializedSizeV0(const IndexedMemProfRecord &Record) {
+ size_t Result = sizeof(GlobalValue::GUID);
+ for (const IndexedAllocationInfo &N : Record.AllocSites)
+ Result += N.serializedSize(Version0);
+
+ // The number of callsites we have information for.
+ Result += sizeof(uint64_t);
+ for (const auto &Frames : Record.CallSites) {
+ // The number of frame ids to serialize.
+ Result += sizeof(uint64_t);
+ Result += Frames.size() * sizeof(FrameId);
+ }
+ return Result;
+}
+
+size_t serializedSizeV2(const IndexedMemProfRecord &Record) {
+ size_t Result = sizeof(GlobalValue::GUID);
+ for (const IndexedAllocationInfo &N : Record.AllocSites)
+ Result += N.serializedSize(Version2);
+
+ // The number of callsites we have information for.
+ Result += sizeof(uint64_t);
+ // The CallStackId
+ Result += Record.CallSiteIds.size() * sizeof(CallStackId);
+ return Result;
+}
+} // namespace
+
+size_t IndexedMemProfRecord::serializedSize(IndexedVersion Version) const {
+ switch (Version) {
+ case Version0:
+ case Version1:
+ return serializedSizeV0(*this);
+ case Version2:
+ return serializedSizeV2(*this);
+ }
+ llvm_unreachable("unsupported MemProf version");
+}
+
+namespace {
+void serializeV0(const IndexedMemProfRecord &Record,
+ const MemProfSchema &Schema, raw_ostream &OS) {
using namespace support;
endian::Writer LE(OS, llvm::endianness::little);
- LE.write<uint64_t>(AllocSites.size());
- for (const IndexedAllocationInfo &N : AllocSites) {
+ LE.write<uint64_t>(Record.AllocSites.size());
+ for (const IndexedAllocationInfo &N : Record.AllocSites) {
LE.write<uint64_t>(N.CallStack.size());
for (const FrameId &Id : N.CallStack)
LE.write<FrameId>(Id);
@@ -26,17 +99,50 @@ void IndexedMemProfRecord::serialize(const MemProfSchema &Schema,
}
// Related contexts.
- LE.write<uint64_t>(CallSites.size());
- for (const auto &Frames : CallSites) {
+ LE.write<uint64_t>(Record.CallSites.size());
+ for (const auto &Frames : Record.CallSites) {
LE.write<uint64_t>(Frames.size());
for (const FrameId &Id : Frames)
LE.write<FrameId>(Id);
}
}
-IndexedMemProfRecord
-IndexedMemProfRecord::deserialize(const MemProfSchema &Schema,
- const unsigned char *Ptr) {
+void serializeV2(const IndexedMemProfRecord &Record,
+ const MemProfSchema &Schema, raw_ostream &OS) {
+ using namespace support;
+
+ endian::Writer LE(OS, llvm::endianness::little);
+
+ LE.write<uint64_t>(Record.AllocSites.size());
+ for (const IndexedAllocationInfo &N : Record.AllocSites) {
+ LE.write<CallStackId>(N.CSId);
+ N.Info.serialize(Schema, OS);
+ }
+
+ // Related contexts.
+ LE.write<uint64_t>(Record.CallSiteIds.size());
+ for (const auto &CSId : Record.CallSiteIds)
+ LE.write<CallStackId>(CSId);
+}
+} // namespace
+
+void IndexedMemProfRecord::serialize(const MemProfSchema &Schema,
+ raw_ostream &OS, IndexedVersion Version) {
+ switch (Version) {
+ case Version0:
+ case Version1:
+ serializeV0(*this, Schema, OS);
+ return;
+ case Version2:
+ serializeV2(*this, Schema, OS);
+ return;
+ }
+ llvm_unreachable("unsupported MemProf version");
+}
+
+namespace {
+IndexedMemProfRecord deserializeV0(const MemProfSchema &Schema,
+ const unsigned char *Ptr) {
using namespace support;
IndexedMemProfRecord Record;
@@ -73,11 +179,57 @@ IndexedMemProfRecord::deserialize(const MemProfSchema &Schema,
Frames.push_back(Id);
}
Record.CallSites.push_back(Frames);
+ Record.CallSiteIds.push_back(hashCallStack(Frames));
}
return Record;
}
+IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema,
+ const unsigned char *Ptr) {
+ using namespace support;
+
+ IndexedMemProfRecord Record;
+
+ // Read the meminfo nodes.
+ const uint64_t NumNodes =
+ endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
+ for (uint64_t I = 0; I < NumNodes; I++) {
+ IndexedAllocationInfo Node;
+ Node.CSId =
+ endian::readNext<CallStackId, llvm::endianness::little, unaligned>(Ptr);
+ Node.Info.deserialize(Schema, Ptr);
+ Ptr += PortableMemInfoBlock::serializedSize();
+ Record.AllocSites.push_back(Node);
+ }
+
+ // Read the callsite information.
+ const uint64_t NumCtxs =
+ endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
+ for (uint64_t J = 0; J < NumCtxs; J++) {
+ CallStackId CSId =
+ endian::readNext<CallStackId, llvm::endianness::little, unaligned>(Ptr);
+ Record.CallSiteIds.push_back(CSId);
+ }
+
+ return Record;
+}
+} // namespace
+
+IndexedMemProfRecord
+IndexedMemProfRecord::deserialize(const MemProfSchema &Schema,
+ const unsigned char *Ptr,
+ IndexedVersion Version) {
+ switch (Version) {
+ case Version0:
+ case Version1:
+ return deserializeV0(Schema, Ptr);
+ case Version2:
+ return deserializeV2(Schema, Ptr);
+ }
+ llvm_unreachable("unsupported MemProf version");
+}
+
GlobalValue::GUID IndexedMemProfRecord::getGUID(const StringRef FunctionName) {
// Canonicalize the function name to drop suffixes such as ".llvm.". Note
// we do not drop any ".__uniq." suffixes, as getCanonicalFnName does not drop
diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp
index c20609748dc9..224ea0924f0a 100644
--- a/llvm/lib/Support/APInt.cpp
+++ b/llvm/lib/Support/APInt.cpp
@@ -1240,53 +1240,17 @@ APInt APInt::sqrt() const {
return x_old + 1;
}
-/// Computes the multiplicative inverse of this APInt for a given modulo. The
-/// iterative extended Euclidean algorithm is used to solve for this value,
-/// however we simplify it to speed up calculating only the inverse, and take
-/// advantage of div+rem calculations. We also use some tricks to avoid copying
-/// (potentially large) APInts around.
-/// WARNING: a value of '0' may be returned,
-/// signifying that no multiplicative inverse exists!
-APInt APInt::multiplicativeInverse(const APInt& modulo) const {
- assert(ult(modulo) && "This APInt must be smaller than the modulo");
-
- // Using the properties listed at the following web page (accessed 06/21/08):
- // http://www.numbertheory.org/php/euclid.html
- // (especially the properties numbered 3, 4 and 9) it can be proved that
- // BitWidth bits suffice for all the computations in the algorithm implemented
- // below. More precisely, this number of bits suffice if the multiplicative
- // inverse exists, but may not suffice for the general extended Euclidean
- // algorithm.
-
- APInt r[2] = { modulo, *this };
- APInt t[2] = { APInt(BitWidth, 0), APInt(BitWidth, 1) };
- APInt q(BitWidth, 0);
-
- unsigned i;
- for (i = 0; r[i^1] != 0; i ^= 1) {
- // An overview of the math without the confusing bit-flipping:
- // q = r[i-2] / r[i-1]
- // r[i] = r[i-2] % r[i-1]
- // t[i] = t[i-2] - t[i-1] * q
- udivrem(r[i], r[i^1], q, r[i]);
- t[i] -= t[i^1] * q;
- }
-
- // If this APInt and the modulo are not coprime, there is no multiplicative
- // inverse, so return 0. We check this by looking at the next-to-last
- // remainder, which is the gcd(*this,modulo) as calculated by the Euclidean
- // algorithm.
- if (r[i] != 1)
- return APInt(BitWidth, 0);
-
- // The next-to-last t is the multiplicative inverse. However, we are
- // interested in a positive inverse. Calculate a positive one from a negative
- // one if necessary. A simple addition of the modulo suffices because
- // abs(t[i]) is known to be less than *this/2 (see the link above).
- if (t[i].isNegative())
- t[i] += modulo;
-
- return std::move(t[i]);
+/// \returns the multiplicative inverse of an odd APInt modulo 2^BitWidth.
+APInt APInt::multiplicativeInverse() const {
+ assert((*this)[0] &&
+ "multiplicative inverse is only defined for odd numbers!");
+
+ // Use Newton's method.
+ APInt Factor = *this;
+ APInt T;
+ while (!(T = *this * Factor).isOne())
+ Factor *= 2 - T;
+ return Factor;
}
/// Implementation of Knuth's Algorithm D (Division of nonnegative integers)
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 6425aa9b091f..3af427d526f8 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -391,9 +391,18 @@ def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
"equivalent when the immediate does "
"not fit in the encoding.">;
-def FeatureAddrLSLFast : SubtargetFeature<
- "addr-lsl-fast", "HasAddrLSLFast", "true",
- "Address operands with logical shift of up to 3 places are cheap">;
+// Address operands with shift amount 2 or 3 are fast on all Arm chips except
+// some old Apple cores (A7-A10?) which handle all shifts slowly. Cortex-A57
+// and derived designs through Cortex-X1 take an extra micro-op for shifts
+// of 1 or 4. Other Arm chips handle all shifted operands at the same speed
+// as unshifted operands.
+//
+// We don't try to model the behavior of the old Apple cores because new code
+// targeting A7 is very unlikely to actually run on an A7. The Cortex cores
+// are modeled by FeatureAddrLSLSlow14.
+def FeatureAddrLSLSlow14 : SubtargetFeature<
+ "addr-lsl-slow-14", "HasAddrLSLSlow14", "true",
+ "Address operands with shift amount of 1 or 4 are slow">;
def FeatureALULSLFast : SubtargetFeature<
"alu-lsl-fast", "HasALULSLFast", "true",
@@ -885,6 +894,7 @@ def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
FeatureBalanceFPOps,
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
+ FeatureAddrLSLSlow14,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -903,6 +913,7 @@ def TuneA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
+ FeatureAddrLSLSlow14,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -910,6 +921,7 @@ def TuneA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
"Cortex-A73 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
+ FeatureAddrLSLSlow14,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -917,6 +929,7 @@ def TuneA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
"Cortex-A75 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
+ FeatureAddrLSLSlow14,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -924,7 +937,7 @@ def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
"Cortex-A76 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -934,7 +947,7 @@ def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -944,7 +957,7 @@ def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -956,7 +969,7 @@ def TuneA78AE : SubtargetFeature<"a78ae", "ARMProcFamily",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -968,7 +981,7 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -979,7 +992,6 @@ def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -990,7 +1002,6 @@ def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715",
FeatureFuseAES,
FeaturePostRAScheduler,
FeatureCmpBccFusion,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureEnableSelectOptimize,
@@ -1001,7 +1012,6 @@ def TuneA720 : SubtargetFeature<"a720", "ARMProcFamily", "CortexA720",
FeatureFuseAES,
FeaturePostRAScheduler,
FeatureCmpBccFusion,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureEnableSelectOptimize,
@@ -1012,7 +1022,6 @@ def TuneA720AE : SubtargetFeature<"a720ae", "ARMProcFamily", "CortexA720",
FeatureFuseAES,
FeaturePostRAScheduler,
FeatureCmpBccFusion,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureEnableSelectOptimize,
@@ -1028,7 +1037,7 @@ def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1039,7 +1048,6 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1047,7 +1055,6 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
"Cortex-X3 ARM processors", [
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureFuseAES,
@@ -1057,7 +1064,6 @@ def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4",
"Cortex-X4 ARM processors", [
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureFuseAES,
@@ -1215,7 +1221,6 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
FeatureStorePairSuppress,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive]>;
@@ -1234,7 +1239,6 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
FeatureStorePairSuppress,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureZCZeroing]>;
@@ -1244,7 +1248,6 @@ def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureStorePairSuppress]>;
@@ -1254,7 +1257,6 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
FeatureStorePairSuppress,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureSlowSTRQro]>;
@@ -1268,7 +1270,7 @@ def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1
"Neoverse N1 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1278,7 +1280,6 @@ def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2
"Neoverse N2 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1288,7 +1289,6 @@ def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Ne
"Neoverse 512-TVB ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1298,7 +1298,7 @@ def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1
"Neoverse V1 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
+ FeatureAddrLSLSlow14,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1309,7 +1309,6 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
"Neoverse V2 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
@@ -1321,7 +1320,6 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
FeatureStorePairSuppress,
- FeatureAddrLSLFast,
FeatureALULSLFast]>;
def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99",
@@ -1381,7 +1379,6 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
FeaturePostRAScheduler,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureAggressiveFMA,
FeatureArithmeticBccFusion,
@@ -1397,7 +1394,6 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
FeaturePostRAScheduler,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureAggressiveFMA,
FeatureArithmeticBccFusion,
@@ -1414,7 +1410,6 @@ def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B",
FeaturePostRAScheduler,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureAddrLSLFast,
FeatureALULSLFast,
FeatureAggressiveFMA,
FeatureArithmeticBccFusion,
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 4fa719ad67cf..f6ccd0ecfdc8 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -268,13 +268,19 @@ void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) {
if (Sign->getZExtValue())
Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
- if (Flags == 0)
- return;
+ uint64_t PAuthABIPlatform = -1;
+ if (const auto *PAP = mdconst::extract_or_null<ConstantInt>(
+ M.getModuleFlag("aarch64-elf-pauthabi-platform")))
+ PAuthABIPlatform = PAP->getZExtValue();
+ uint64_t PAuthABIVersion = -1;
+ if (const auto *PAV = mdconst::extract_or_null<ConstantInt>(
+ M.getModuleFlag("aarch64-elf-pauthabi-version")))
+ PAuthABIVersion = PAV->getZExtValue();
// Emit a .note.gnu.property section with the flags.
auto *TS =
static_cast<AArch64TargetStreamer *>(OutStreamer->getTargetStreamer());
- TS->emitNoteSection(Flags);
+ TS->emitNoteSection(Flags, PAuthABIPlatform, PAuthABIVersion);
}
void AArch64AsmPrinter::emitFunctionHeaderComment() {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 163ed520a8a6..51bec3604026 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -462,7 +462,7 @@ private:
SDValue &Offset, SDValue &SignExtend,
SDValue &DoShift);
bool isWorthFoldingALU(SDValue V, bool LSL = false) const;
- bool isWorthFoldingAddr(SDValue V) const;
+ bool isWorthFoldingAddr(SDValue V, unsigned Size) const;
bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
SDValue &Offset, SDValue &SignExtend);
@@ -674,17 +674,22 @@ static bool isWorthFoldingSHL(SDValue V) {
/// Determine whether it is worth to fold V into an extended register addressing
/// mode.
-bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V) const {
+bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V, unsigned Size) const {
// Trivial if we are optimizing for code size or if there is only
// one use of the value.
if (CurDAG->shouldOptForSize() || V.hasOneUse())
return true;
- // If a subtarget has a fastpath LSL we can fold a logical shift into
- // the addressing mode and save a cycle.
- if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::SHL &&
- isWorthFoldingSHL(V))
+
+ // If a subtarget has a slow shift, folding a shift into multiple loads
+ // costs additional micro-ops.
+ if (Subtarget->hasAddrLSLSlow14() && (Size == 2 || Size == 16))
+ return false;
+
+ // Check whether we're going to emit the address arithmetic anyway because
+ // it's used by a non-address operation.
+ if (V.getOpcode() == ISD::SHL && isWorthFoldingSHL(V))
return true;
- if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::ADD) {
+ if (V.getOpcode() == ISD::ADD) {
const SDValue LHS = V.getOperand(0);
const SDValue RHS = V.getOperand(1);
if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
@@ -1203,7 +1208,7 @@ bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
return false;
- return isWorthFoldingAddr(N);
+ return isWorthFoldingAddr(N, Size);
}
bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
@@ -1231,7 +1236,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
}
// Remember if it is worth folding N when it produces extended register.
- bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N);
+ bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N, Size);
// Try to match a shifted extend on the RHS.
if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
@@ -1261,7 +1266,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
MVT::i32);
- if (isWorthFoldingAddr(LHS))
+ if (isWorthFoldingAddr(LHS, Size))
return true;
}
@@ -1273,7 +1278,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
MVT::i32);
- if (isWorthFoldingAddr(RHS))
+ if (isWorthFoldingAddr(RHS, Size))
return true;
}
@@ -1343,7 +1348,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
}
// Remember if it is worth folding N when it produces extended register.
- bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N);
+ bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N, Size);
// Try to match a shifted extend on the RHS.
if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index d0c5e6b99e9e..22687b0e31c2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2993,7 +2993,7 @@ bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
return false;
Shift = AArch64_AM::getShiftValue(Shift);
if (!OptSize) {
- if ((Shift != 2 && Shift != 3) || !Subtarget.hasAddrLSLFast())
+ if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
return false;
if (avoidSlowSTRQ(MemI))
return false;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index a8f2c45279e6..d4daf17a39d5 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -6907,10 +6907,8 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
MI.getParent()->getParent()->getFunction().hasOptSize())
return true;
- // It's better to avoid folding and recomputing shifts when we don't have a
- // fastpath.
- if (!STI.hasAddrLSLFast())
- return false;
+ // FIXME: Consider checking HasAddrLSLSlow14 and HasALULSLFast as
+ // appropriate.
// We have a fastpath, so folding a shift in and potentially computing it
// many times may be beneficial. Check if this is only used in memory ops.
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index e1d6dd7a056b..dc5383ce941e 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -58,8 +58,17 @@ void AArch64TargetStreamer::finish() {
emitNoteSection(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI);
}
-void AArch64TargetStreamer::emitNoteSection(unsigned Flags) {
- if (Flags == 0)
+void AArch64TargetStreamer::emitNoteSection(unsigned Flags,
+ uint64_t PAuthABIPlatform,
+ uint64_t PAuthABIVersion) {
+ assert((PAuthABIPlatform == uint64_t(-1)) ==
+ (PAuthABIVersion == uint64_t(-1)));
+ uint64_t DescSz = 0;
+ if (Flags != 0)
+ DescSz += 4 * 4;
+ if (PAuthABIPlatform != uint64_t(-1))
+ DescSz += 4 + 4 + 8 * 2;
+ if (DescSz == 0)
return;
MCStreamer &OutStreamer = getStreamer();
@@ -80,15 +89,25 @@ void AArch64TargetStreamer::emitNoteSection(unsigned Flags) {
// Emit the note header.
OutStreamer.emitValueToAlignment(Align(8));
OutStreamer.emitIntValue(4, 4); // data size for "GNU\0"
- OutStreamer.emitIntValue(4 * 4, 4); // Elf_Prop size
+ OutStreamer.emitIntValue(DescSz, 4); // Elf_Prop array size
OutStreamer.emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4);
OutStreamer.emitBytes(StringRef("GNU", 4)); // note name
// Emit the PAC/BTI properties.
- OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4);
- OutStreamer.emitIntValue(4, 4); // data size
- OutStreamer.emitIntValue(Flags, 4); // data
- OutStreamer.emitIntValue(0, 4); // pad
+ if (Flags != 0) {
+ OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4);
+ OutStreamer.emitIntValue(4, 4); // data size
+ OutStreamer.emitIntValue(Flags, 4); // data
+ OutStreamer.emitIntValue(0, 4); // pad
+ }
+
+ // Emit the PAuth ABI compatibility info
+ if (PAuthABIPlatform != uint64_t(-1)) {
+ OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_PAUTH, 4);
+ OutStreamer.emitIntValue(8 * 2, 4); // data size
+ OutStreamer.emitIntValue(PAuthABIPlatform, 8);
+ OutStreamer.emitIntValue(PAuthABIVersion, 8);
+ }
OutStreamer.endSection(Nt);
OutStreamer.switchSection(Cur);
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
index 7676d88a82b5..e8a9dc445b96 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -35,7 +35,8 @@ public:
void emitCurrentConstantPool();
/// Callback used to implement the .note.gnu.property section.
- void emitNoteSection(unsigned Flags);
+ void emitNoteSection(unsigned Flags, uint64_t PAuthABIPlatform = -1,
+ uint64_t PAuthABIVersion = -1);
/// Callback used to implement the .inst directive.
virtual void emitInst(uint32_t Inst);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 9083150b3384..1114a8c40114 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1086,7 +1086,7 @@ void SplitPtrStructs::processConditionals() {
if (MaybeRsrc)
for (Value *V : Seen)
FoundRsrcs[cast<Instruction>(V)] = NewRsrc;
- } else if (auto *SI = dyn_cast<SelectInst>(I)) {
+ } else if (isa<SelectInst>(I)) {
if (MaybeRsrc) {
ConditionalTemps.push_back(cast<Instruction>(Rsrc));
Rsrc->replaceAllUsesWith(*MaybeRsrc);
@@ -1777,8 +1777,8 @@ void SplitPtrStructs::processFunction(Function &F) {
Originals.push_back(&I);
for (Instruction *I : Originals) {
auto [Rsrc, Off] = visit(I);
- assert((Rsrc && Off) ||
- (!Rsrc && !Off) && "Can't have a resource but no offset");
+ assert(((Rsrc && Off) || (!Rsrc && !Off)) &&
+ "Can't have a resource but no offset");
if (Rsrc)
RsrcParts[I] = Rsrc;
if (Off)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 27621906e4c5..bb499c5c8c57 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -708,9 +708,6 @@ public:
WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr,
bool FlushVmCnt);
- bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
- WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr);
bool generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
@@ -1902,31 +1899,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
OldWaitcntInstr);
}
-// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the
-// end of the given block if needed.
-bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
- WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr) {
- AMDGPU::Waitcnt Wait;
-
- unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT);
- unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT);
- unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT);
-
- if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0)
- return false;
-
- if (LoadCntPending != 0)
- Wait.LoadCnt = 0;
- if (SampleCntPending != 0)
- Wait.SampleCnt = 0;
- if (BvhCntPending != 0)
- Wait.BvhCnt = 0;
-
- return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
- OldWaitcntInstr);
-}
-
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block,
@@ -2355,9 +2327,22 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
++Iter;
}
+ // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
+ // needed.
+ AMDGPU::Waitcnt Wait;
if (Block.getFirstTerminator() == Block.end() &&
- isPreheaderToFlush(Block, ScoreBrackets))
- Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
+ isPreheaderToFlush(Block, ScoreBrackets)) {
+ if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
+ Wait.LoadCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
+ Wait.SampleCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(BVH_CNT))
+ Wait.BvhCnt = 0;
+ }
+
+ // Combine or remove any redundant waitcnts at the end of the block.
+ Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
+ OldWaitcntInstr);
return Modified;
}
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
index 4f34514c2cfe..86e44343b508 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
@@ -290,16 +290,7 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
switch (Opc) {
case TargetOpcode::G_ADD:
- case TargetOpcode::G_SUB: {
- if (MRI.getType(MI.getOperand(0).getReg()).isVector()) {
- LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- return getInstructionMapping(
- DefaultMappingID, /*Cost=*/1,
- getVRBValueMapping(Ty.getSizeInBits().getKnownMinValue()),
- NumOperands);
- }
- }
- LLVM_FALLTHROUGH;
+ case TargetOpcode::G_SUB:
case TargetOpcode::G_SHL:
case TargetOpcode::G_ASHR:
case TargetOpcode::G_LSHR:
@@ -320,10 +311,6 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case TargetOpcode::G_PTR_ADD:
case TargetOpcode::G_PTRTOINT:
case TargetOpcode::G_INTTOPTR:
- case TargetOpcode::G_SEXTLOAD:
- case TargetOpcode::G_ZEXTLOAD:
- return getInstructionMapping(DefaultMappingID, /*Cost=*/1, GPRValueMapping,
- NumOperands);
case TargetOpcode::G_FADD:
case TargetOpcode::G_FSUB:
case TargetOpcode::G_FMUL:
@@ -334,25 +321,48 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINNUM: {
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- return getInstructionMapping(DefaultMappingID, /*Cost=*/1,
- getFPValueMapping(Ty.getSizeInBits()),
- NumOperands);
+ TypeSize Size = Ty.getSizeInBits();
+
+ const ValueMapping *Mapping;
+ if (Ty.isVector())
+ Mapping = getVRBValueMapping(Size.getKnownMinValue());
+ else if (isPreISelGenericFloatingPointOpcode(Opc))
+ Mapping = getFPValueMapping(Size.getFixedValue());
+ else
+ Mapping = GPRValueMapping;
+
+#ifndef NDEBUG
+ // Make sure all the operands are using similar size and type.
+ for (unsigned Idx = 1; Idx != NumOperands; ++Idx) {
+ LLT OpTy = MRI.getType(MI.getOperand(Idx).getReg());
+ assert(Ty.isVector() == OpTy.isVector() &&
+ "Operand has incompatible type");
+ // Don't check size for GPR.
+ if (OpTy.isVector() || isPreISelGenericFloatingPointOpcode(Opc))
+ assert(Size == OpTy.getSizeInBits() && "Operand has incompatible size");
+ }
+#endif // End NDEBUG
+
+ return getInstructionMapping(DefaultMappingID, 1, Mapping, NumOperands);
}
+ case TargetOpcode::G_SEXTLOAD:
+ case TargetOpcode::G_ZEXTLOAD:
+ return getInstructionMapping(DefaultMappingID, /*Cost=*/1, GPRValueMapping,
+ NumOperands);
case TargetOpcode::G_IMPLICIT_DEF: {
Register Dst = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst);
- uint64_t DstMinSize = DstTy.getSizeInBits().getKnownMinValue();
+ unsigned DstMinSize = DstTy.getSizeInBits().getKnownMinValue();
auto Mapping = GPRValueMapping;
// FIXME: May need to do a better job determining when to use FPRB.
// For example, the look through COPY case:
// %0:_(s32) = G_IMPLICIT_DEF
// %1:_(s32) = COPY %0
// $f10_d = COPY %1(s32)
- if (anyUseOnlyUseFP(Dst, MRI, TRI))
- Mapping = getFPValueMapping(DstMinSize);
-
if (DstTy.isVector())
Mapping = getVRBValueMapping(DstMinSize);
+ else if (anyUseOnlyUseFP(Dst, MRI, TRI))
+ Mapping = getFPValueMapping(DstMinSize);
return getInstructionMapping(DefaultMappingID, /*Cost=*/1, Mapping,
NumOperands);
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 55ba4949b3ea..f99dc0b85763 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3287,24 +3287,24 @@ bool RISCVDAGToDAGISel::selectVSplatUimm(SDValue N, unsigned Bits,
}
bool RISCVDAGToDAGISel::selectLow8BitsVSplat(SDValue N, SDValue &SplatVal) {
- // Truncates are custom lowered during legalization.
- auto IsTrunc = [this](SDValue N) {
- if (N->getOpcode() != RISCVISD::TRUNCATE_VECTOR_VL)
+ auto IsExtOrTrunc = [](SDValue N) {
+ switch (N->getOpcode()) {
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ // There's no passthru on these _VL nodes so any VL/mask is ok, since any
+ // inactive elements will be undef.
+ case RISCVISD::TRUNCATE_VECTOR_VL:
+ case RISCVISD::VSEXT_VL:
+ case RISCVISD::VZEXT_VL:
+ return true;
+ default:
return false;
- SDValue VL;
- selectVLOp(N->getOperand(2), VL);
- // Any vmset_vl is ok, since any bits past VL are undefined and we can
- // assume they are set.
- return N->getOperand(1).getOpcode() == RISCVISD::VMSET_VL &&
- isa<ConstantSDNode>(VL) &&
- cast<ConstantSDNode>(VL)->getSExtValue() == RISCV::VLMaxSentinel;
+ }
};
- // We can have multiple nested truncates, so unravel them all if needed.
- while (N->getOpcode() == ISD::SIGN_EXTEND ||
- N->getOpcode() == ISD::ZERO_EXTEND || IsTrunc(N)) {
- if (!N.hasOneUse() ||
- N.getValueType().getSizeInBits().getKnownMinValue() < 8)
+ // We can have multiple nested nodes, so unravel them all if needed.
+ while (IsExtOrTrunc(N)) {
+ if (!N.hasOneUse() || N.getScalarValueSizeInBits() < 8)
return false;
N = N->getOperand(0);
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index cc44092700c6..73d52d5ecafb 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -387,6 +387,9 @@ def SDT_RISCVVEXTEND_VL : SDTypeProfile<1, 3, [SDTCisVec<0>,
SDTCisVT<3, XLenVT>]>;
def riscv_sext_vl : SDNode<"RISCVISD::VSEXT_VL", SDT_RISCVVEXTEND_VL>;
def riscv_zext_vl : SDNode<"RISCVISD::VZEXT_VL", SDT_RISCVVEXTEND_VL>;
+def riscv_ext_vl : PatFrags<(ops node:$A, node:$B, node:$C),
+ [(riscv_sext_vl node:$A, node:$B, node:$C),
+ (riscv_zext_vl node:$A, node:$B, node:$C)]>;
def riscv_trunc_vector_vl : SDNode<"RISCVISD::TRUNCATE_VECTOR_VL",
SDTypeProfile<1, 3, [SDTCisVec<0>,
@@ -535,6 +538,11 @@ def riscv_zext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C),
return N->hasOneUse();
}]>;
+def riscv_ext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C),
+ (riscv_ext_vl node:$A, node:$B, node:$C), [{
+ return N->hasOneUse();
+}]>;
+
def riscv_fpextend_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C),
(riscv_fpextend_vl node:$A, node:$B, node:$C), [{
return N->hasOneUse();
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index 51a7a0a15d97..c1facc790fc0 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -630,6 +630,19 @@ foreach vtiToWti = AllWidenableIntVectors in {
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_shl_vl
+ (wti.Vector (riscv_zext_vl_oneuse
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Mask V0), VLOpFrag)),
+ (wti.Vector (riscv_ext_vl_oneuse
+ (vti.Vector vti.RegClass:$rs1),
+ (vti.Mask V0), VLOpFrag)),
+ (wti.Vector wti.RegClass:$merge),
+ (vti.Mask V0), VLOpFrag),
+ (!cast<Instruction>("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK")
+ wti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+
+ def : Pat<(riscv_shl_vl
(wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))),
(wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
(wti.Vector wti.RegClass:$merge),
@@ -639,6 +652,17 @@ foreach vtiToWti = AllWidenableIntVectors in {
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_shl_vl
+ (wti.Vector (riscv_zext_vl_oneuse
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Mask V0), VLOpFrag)),
+ (wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
+ (wti.Vector wti.RegClass:$merge),
+ (vti.Mask V0), VLOpFrag),
+ (!cast<Instruction>("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK")
+ wti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+
+ def : Pat<(riscv_shl_vl
(wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))),
(wti.Vector (SplatPat_uimm5 uimm5:$rs1)),
(wti.Vector wti.RegClass:$merge),
@@ -647,6 +671,17 @@ foreach vtiToWti = AllWidenableIntVectors in {
wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+ def : Pat<(riscv_shl_vl
+ (wti.Vector (riscv_zext_vl_oneuse
+ (vti.Vector vti.RegClass:$rs2),
+ (vti.Mask V0), VLOpFrag)),
+ (wti.Vector (SplatPat_uimm5 uimm5:$rs1)),
+ (wti.Vector wti.RegClass:$merge),
+ (vti.Mask V0), VLOpFrag),
+ (!cast<Instruction>("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK")
+ wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
+
def : Pat<(riscv_vwsll_vl
(vti.Vector vti.RegClass:$rs2),
(vti.Vector vti.RegClass:$rs1),
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index ba108912d934..85f8f5f654fe 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -254,6 +254,7 @@ public:
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
+ bool isTargetAndroid() const { return getTargetTriple().isAndroid(); }
bool isTargetFuchsia() const { return getTargetTriple().isOSFuchsia(); }
bool useConstantPoolForLargeInts() const;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 27a4d78d6df7..aeec06313c75 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -245,6 +245,10 @@ RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
return TTI::TCC_Free;
}
+bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
+ return ST->hasVInstructions();
+}
+
TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index ac32aea4ce2b..c0169ea1ad53 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -78,6 +78,22 @@ public:
const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind);
+ /// \name EVL Support for predicated vectorization.
+ /// Whether the target supports the %evl parameter of VP intrinsic efficiently
+ /// in hardware, for the given opcode and type/alignment. (see LLVM Language
+ /// Reference - "Vector Predication Intrinsics",
+ /// https://llvm.org/docs/LangRef.html#vector-predication-intrinsics and
+ /// "IR-level VP intrinsics",
+ /// https://llvm.org/docs/Proposals/VectorPredication.html#ir-level-vp-intrinsics).
+ /// \param Opcode the opcode of the instruction checked for predicated version
+ /// support.
+ /// \param DataType the type of the instruction with the \p Opcode checked for
+ /// prediction support.
+ /// \param Alignment the alignment for memory access operation checked for
+ /// predicated version support.
+ bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
+ Align Alignment) const;
+
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
bool shouldExpandReduction(const IntrinsicInst *II) const;
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 1674cef7cb82..9e4ba2191366 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -243,8 +243,12 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
continue;
MetadataAsValue *VMD = cast<MetadataAsValue>(II->getOperand(1));
- SPIRVType *ElementType = GR->getOrCreateSPIRVType(
- cast<ConstantAsMetadata>(VMD->getMetadata())->getType(), MIRBuilder);
+ Type *ElementTy = cast<ConstantAsMetadata>(VMD->getMetadata())->getType();
+ if (isUntypedPointerTy(ElementTy))
+ ElementTy =
+ TypedPointerType::get(IntegerType::getInt8Ty(II->getContext()),
+ getPointerAddressSpace(ElementTy));
+ SPIRVType *ElementType = GR->getOrCreateSPIRVType(ElementTy, MIRBuilder);
return GR->getOrCreateSPIRVPointerType(
ElementType, MIRBuilder,
addressSpaceToStorageClass(
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index e0099e529447..ac799374adce 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -47,7 +47,7 @@ class SPIRVGlobalRegistry {
DenseMap<const MachineOperand *, const Function *> InstrToFunction;
// Maps Functions to their calls (in a form of the machine instruction,
// OpFunctionCall) that happened before the definition is available
- DenseMap<const Function *, SmallVector<MachineInstr *>> ForwardCalls;
+ DenseMap<const Function *, SmallPtrSet<MachineInstr *, 8>> ForwardCalls;
// Look for an equivalent of the newType in the map. Return the equivalent
// if it's found, otherwise insert newType to the map and return the type.
@@ -215,12 +215,12 @@ public:
if (It == ForwardCalls.end())
ForwardCalls[F] = {MI};
else
- It->second.push_back(MI);
+ It->second.insert(MI);
}
// Map a Function to the vector of machine instructions that represents
// forward function calls or to nullptr if not found.
- SmallVector<MachineInstr *> *getForwardCalls(const Function *F) {
+ SmallPtrSet<MachineInstr *, 8> *getForwardCalls(const Function *F) {
auto It = ForwardCalls.find(F);
return It == ForwardCalls.end() ? nullptr : &It->second;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
index 90a31551f45a..d450078d793f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
@@ -193,7 +193,7 @@ void validateForwardCalls(const SPIRVSubtarget &STI,
MachineRegisterInfo *DefMRI, SPIRVGlobalRegistry &GR,
MachineInstr &FunDef) {
const Function *F = GR.getFunctionByDefinition(&FunDef);
- if (SmallVector<MachineInstr *> *FwdCalls = GR.getForwardCalls(F))
+ if (SmallPtrSet<MachineInstr *, 8> *FwdCalls = GR.getForwardCalls(F))
for (MachineInstr *FunCall : *FwdCalls) {
MachineRegisterInfo *CallMRI =
&FunCall->getParent()->getParent()->getRegInfo();
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index f4525e713c98..49749b563453 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -1825,7 +1825,24 @@ bool SPIRVInstructionSelector::selectAllocaArray(Register ResVReg,
bool SPIRVInstructionSelector::selectFrameIndex(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
- return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpVariable))
+ // Change order of instructions if needed: all OpVariable instructions in a
+ // function must be the first instructions in the first block
+ MachineFunction *MF = I.getParent()->getParent();
+ MachineBasicBlock *MBB = &MF->front();
+ auto It = MBB->SkipPHIsAndLabels(MBB->begin()), E = MBB->end();
+ bool IsHeader = false;
+ unsigned Opcode;
+ for (; It != E && It != I; ++It) {
+ Opcode = It->getOpcode();
+ if (Opcode == SPIRV::OpFunction || Opcode == SPIRV::OpFunctionParameter) {
+ IsHeader = true;
+ } else if (IsHeader &&
+ !(Opcode == SPIRV::ASSIGN_TYPE || Opcode == SPIRV::OpLabel)) {
+ ++It;
+ break;
+ }
+ }
+ return BuildMI(*MBB, It, It->getDebugLoc(), TII.get(SPIRV::OpVariable))
.addDef(ResVReg)
.addUse(GR.getSPIRVTypeID(ResType))
.addImm(static_cast<uint32_t>(SPIRV::StorageClass::Function))
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index 215a8ea83190..6855471840e9 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -434,6 +434,50 @@ bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
default:
// See if this is a generic print operand
return AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O);
+ case 'L': // Low order register of a twin word register operand
+ case 'H': // High order register of a twin word register operand
+ {
+ const SparcSubtarget &Subtarget = MF->getSubtarget<SparcSubtarget>();
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ const SparcRegisterInfo *RegisterInfo = Subtarget.getRegisterInfo();
+ Register MOReg = MO.getReg();
+
+ Register HiReg, LoReg;
+ if (!SP::IntPairRegClass.contains(MOReg)) {
+ // If we aren't given a register pair already, find out which pair it
+ // belongs to. Note that here, the specified register operand, which
+ // refers to the high part of the twinword, needs to be an even-numbered
+ // register.
+ MOReg = RegisterInfo->getMatchingSuperReg(MOReg, SP::sub_even,
+ &SP::IntPairRegClass);
+ if (!MOReg) {
+ SMLoc Loc;
+ OutContext.reportError(
+ Loc, "Hi part of pair should point to an even-numbered register");
+ OutContext.reportError(
+ Loc, "(note that in some cases it might be necessary to manually "
+ "bind the input/output registers instead of relying on "
+ "automatic allocation)");
+ return true;
+ }
+ }
+
+ HiReg = RegisterInfo->getSubReg(MOReg, SP::sub_even);
+ LoReg = RegisterInfo->getSubReg(MOReg, SP::sub_odd);
+
+ Register Reg;
+ switch (ExtraCode[0]) {
+ case 'L':
+ Reg = LoReg;
+ break;
+ case 'H':
+ Reg = HiReg;
+ break;
+ }
+
+ O << '%' << SparcInstPrinter::getRegisterName(Reg);
+ return false;
+ }
case 'f':
case 'r':
break;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a9751e170fe6..6f65344215c0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42725,6 +42725,8 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
switch (Op.getOpcode()) {
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
+ case X86ISD::UNPCKH:
+ case X86ISD::UNPCKL:
return false;
}
return TargetLowering::canCreateUndefOrPoisonForTargetNode(
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index ce3b6af4cab4..270dd32c7235 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -2161,6 +2161,11 @@ multiclass EFLAGSDefiningPats<string suffix, Predicate p> {
def : Pat<(X86sub_flag_nocf GR16:$src, -1), (!cast<Instruction>(INC16r#suffix) GR16:$src)>;
def : Pat<(X86sub_flag_nocf GR32:$src, -1), (!cast<Instruction>(INC32r#suffix) GR32:$src)>;
def : Pat<(X86sub_flag_nocf GR64:$src, -1), (!cast<Instruction>(INC64r#suffix) GR64:$src)>;
+
+ def : Pat<(or_is_add GR8:$src, 1), (!cast<Instruction>(INC8r#suffix) GR8:$src)>;
+ def : Pat<(or_is_add GR16:$src, 1), (!cast<Instruction>(INC16r#suffix) GR16:$src)>;
+ def : Pat<(or_is_add GR32:$src, 1), (!cast<Instruction>(INC32r#suffix) GR32:$src)>;
+ def : Pat<(or_is_add GR64:$src, 1), (!cast<Instruction>(INC64r#suffix) GR64:$src)>;
}
}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index f24334312c11..a5b2e4895ede 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -6276,10 +6276,10 @@ static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget,
case X86::RCPSSm:
case X86::RCPSSr_Int:
case X86::RCPSSm_Int:
- case X86::ROUNDSDr:
- case X86::ROUNDSDm:
- case X86::ROUNDSSr:
- case X86::ROUNDSSm:
+ case X86::ROUNDSDri:
+ case X86::ROUNDSDmi:
+ case X86::ROUNDSSri:
+ case X86::ROUNDSSmi:
case X86::RSQRTSSr:
case X86::RSQRTSSm:
case X86::RSQRTSSr_Int:
@@ -6778,14 +6778,14 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
case X86::VRCPSSr_Int:
case X86::VRCPSSm:
case X86::VRCPSSm_Int:
- case X86::VROUNDSDr:
- case X86::VROUNDSDm:
- case X86::VROUNDSDr_Int:
- case X86::VROUNDSDm_Int:
- case X86::VROUNDSSr:
- case X86::VROUNDSSm:
- case X86::VROUNDSSr_Int:
- case X86::VROUNDSSm_Int:
+ case X86::VROUNDSDri:
+ case X86::VROUNDSDmi:
+ case X86::VROUNDSDri_Int:
+ case X86::VROUNDSDmi_Int:
+ case X86::VROUNDSSri:
+ case X86::VROUNDSSmi:
+ case X86::VROUNDSSri_Int:
+ case X86::VROUNDSSmi_Int:
case X86::VRSQRTSSr:
case X86::VRSQRTSSr_Int:
case X86::VRSQRTSSm:
@@ -7516,8 +7516,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VRCPSSr_Int:
case X86::RSQRTSSr_Int:
case X86::VRSQRTSSr_Int:
- case X86::ROUNDSSr_Int:
- case X86::VROUNDSSr_Int:
+ case X86::ROUNDSSri_Int:
+ case X86::VROUNDSSri_Int:
case X86::COMISSrr_Int:
case X86::VCOMISSrr_Int:
case X86::VCOMISSZrr_Int:
@@ -7685,8 +7685,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VCVTSD2USI64Zrr_Int:
case X86::VCVTTSD2USIZrr_Int:
case X86::VCVTTSD2USI64Zrr_Int:
- case X86::ROUNDSDr_Int:
- case X86::VROUNDSDr_Int:
+ case X86::ROUNDSDri_Int:
+ case X86::VROUNDSDri_Int:
case X86::COMISDrr_Int:
case X86::VCOMISDrr_Int:
case X86::VCOMISDZrr_Int:
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 69d45366a1db..2b391b60f2c9 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5475,35 +5475,35 @@ multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
// Intrinsic operation, reg.
// Vector intrinsic operation, reg
let Uses = [MXCSR], mayRaiseFPException = 1 in {
- def r : SS4AIi8<opc, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
- Sched<[sched]>;
+ def ri : SS4AIi8<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
+ Sched<[sched]>;
// Vector intrinsic operation, mem
- def m : SS4AIi8<opc, MRMSrcMem,
- (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst,
- (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
- Sched<[sched.Folded]>;
+ def mi : SS4AIi8<opc, MRMSrcMem,
+ (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
+ Sched<[sched.Folded]>;
}
}
multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
- def SSr : SS4AIi8<opcss, MRMSrcReg,
+ def SSri : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[sched]>;
let mayLoad = 1 in
- def SSm : SS4AIi8<opcss, MRMSrcMem,
+ def SSmi : SS4AIi8<opcss, MRMSrcMem,
(outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
@@ -5511,14 +5511,14 @@ let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
- def SDr : SS4AIi8<opcsd, MRMSrcReg,
+ def SDri : SS4AIi8<opcsd, MRMSrcReg,
(outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[sched]>;
let mayLoad = 1 in
- def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ def SDmi : SS4AIi8<opcsd, MRMSrcMem,
(outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
@@ -5530,44 +5530,44 @@ multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
string OpcodeStr, X86FoldableSchedWrite sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
- def SSr : SS4AIi8<opcss, MRMSrcReg,
- (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[sched]>;
+ def SSri : SS4AIi8<opcss, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched]>;
let mayLoad = 1 in
- def SSm : SS4AIi8<opcss, MRMSrcMem,
- (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def SSmi : SS4AIi8<opcss, MRMSrcMem,
+ (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
- def SDr : SS4AIi8<opcsd, MRMSrcReg,
- (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[sched]>;
+ def SDri : SS4AIi8<opcsd, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched]>;
let mayLoad = 1 in
- def SDm : SS4AIi8<opcsd, MRMSrcMem,
- (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ def SDmi : SS4AIi8<opcsd, MRMSrcMem,
+ (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
}
-multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
- string OpcodeStr, X86FoldableSchedWrite sched,
- ValueType VT32, ValueType VT64,
- SDNode OpNode, bit Is2Addr = 1> {
+multiclass sse41_fp_unop_s_int<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr, X86FoldableSchedWrite sched,
+ ValueType VT32, ValueType VT64,
+ SDNode OpNode, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle in {
- def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
+ def SSri_Int : SS4AIi8<opcss, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
@@ -5577,7 +5577,7 @@ let ExeDomain = SSEPackedSingle in {
[(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
Sched<[sched]>;
- def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
+ def SSmi_Int : SS4AIi8<opcss, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
@@ -5590,7 +5590,7 @@ let ExeDomain = SSEPackedSingle in {
} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
let ExeDomain = SSEPackedDouble in {
- def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
+ def SDri_Int : SS4AIi8<opcsd, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
@@ -5600,7 +5600,7 @@ let ExeDomain = SSEPackedDouble in {
[(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
Sched<[sched]>;
- def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
+ def SDmi_Int : SS4AIi8<opcsd, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
@@ -5636,25 +5636,25 @@ let Predicates = [HasAVX, NoVLX] in {
}
}
let Predicates = [UseAVX] in {
- defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
- v4f32, v2f64, X86RndScales, 0>,
- VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
+ defm VROUND : sse41_fp_unop_s_int<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
+ v4f32, v2f64, X86RndScales, 0>,
+ VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
}
let Predicates = [UseAVX] in {
def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
+ (VROUNDSSri (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
+ (VROUNDSDri (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
}
let Predicates = [UseAVX, OptForSize] in {
def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
- (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
+ (VROUNDSSmi (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
- (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
+ (VROUNDSDmi (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
}
let ExeDomain = SSEPackedSingle in
@@ -5667,21 +5667,21 @@ defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
let Constraints = "$src1 = $dst" in
-defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
- v4f32, v2f64, X86RndScales>;
+defm ROUND : sse41_fp_unop_s_int<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
+ v4f32, v2f64, X86RndScales>;
let Predicates = [UseSSE41] in {
def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
- (ROUNDSSr FR32:$src1, timm:$src2)>;
+ (ROUNDSSri FR32:$src1, timm:$src2)>;
def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
- (ROUNDSDr FR64:$src1, timm:$src2)>;
+ (ROUNDSDri FR64:$src1, timm:$src2)>;
}
let Predicates = [UseSSE41, OptForSize] in {
def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
- (ROUNDSSm addr:$src1, timm:$src2)>;
+ (ROUNDSSmi addr:$src1, timm:$src2)>;
def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
- (ROUNDSDm addr:$src1, timm:$src2)>;
+ (ROUNDSDmi addr:$src1, timm:$src2)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index b3ee7a82b917..63ac91028ac9 100644
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -329,11 +329,9 @@ defm : X86WriteRes<WriteDPPSY, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4>;
defm : X86WriteRes<WriteDPPSLd, [BWPort0,BWPort1,BWPort5,BWPort06,BWPort23], 19, [2,1,1,1,1], 6>;
defm : X86WriteRes<WriteDPPSYLd, [BWPort0,BWPort1,BWPort5,BWPort06,BWPort23], 20, [2,1,1,1,1], 6>;
defm : BWWriteResPair<WriteFSign, [BWPort5], 1>; // Floating point fabs/fchs.
-defm : X86WriteRes<WriteFRnd, [BWPort23], 6, [1], 1>; // Floating point rounding.
-defm : X86WriteRes<WriteFRndY, [BWPort23], 6, [1], 1>; // Floating point rounding (YMM/ZMM).
+defm : BWWriteResPair<WriteFRnd, [BWPort1], 6, [2], 2, 5>; // Floating point rounding.
+defm : BWWriteResPair<WriteFRndY, [BWPort1], 6, [2], 2, 6>; // Floating point rounding (YMM/ZMM).
defm : X86WriteResPairUnsupported<WriteFRndZ>;
-defm : X86WriteRes<WriteFRndLd, [BWPort1,BWPort23], 11, [2,1], 3>;
-defm : X86WriteRes<WriteFRndYLd, [BWPort1,BWPort23], 12, [2,1], 3>;
defm : BWWriteResPair<WriteFLogic, [BWPort5], 1, [1], 1, 5>; // Floating point and/or/xor logicals.
defm : BWWriteResPair<WriteFLogicY, [BWPort5], 1, [1], 1, 6>; // Floating point and/or/xor logicals (YMM/ZMM).
defm : X86WriteResPairUnsupported<WriteFLogicZ>;
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index 6c301a3cd342..516dc62f1b6d 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -329,12 +329,9 @@ defm : X86WriteRes<WriteDPPSY, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4>;
defm : X86WriteRes<WriteDPPSLd, [HWPort0,HWPort1,HWPort5,HWPort06,HWPort23], 20, [2,1,1,1,1], 6>;
defm : X86WriteRes<WriteDPPSYLd, [HWPort0,HWPort1,HWPort5,HWPort06,HWPort23], 21, [2,1,1,1,1], 6>;
defm : HWWriteResPair<WriteFSign, [HWPort0], 1>;
-defm : X86WriteRes<WriteFRnd, [HWPort23], 6, [1], 1>;
-defm : X86WriteRes<WriteFRndY, [HWPort23], 6, [1], 1>;
-defm : X86WriteRes<WriteFRndZ, [HWPort23], 6, [1], 1>; // Unsupported = 1
-defm : X86WriteRes<WriteFRndLd, [HWPort1,HWPort23], 12, [2,1], 3>;
-defm : X86WriteRes<WriteFRndYLd, [HWPort1,HWPort23], 13, [2,1], 3>;
-defm : X86WriteRes<WriteFRndZLd, [HWPort1,HWPort23], 13, [2,1], 3>; // Unsupported = 1
+defm : HWWriteResPair<WriteFRnd, [HWPort1], 6, [2], 2, 6>;
+defm : HWWriteResPair<WriteFRndY, [HWPort1], 6, [2], 2, 7>;
+defm : HWWriteResPair<WriteFRndZ, [HWPort1], 6, [2], 2, 7>; // Unsupported = 1
defm : HWWriteResPair<WriteFLogic, [HWPort5], 1, [1], 1, 6>;
defm : HWWriteResPair<WriteFLogicY, [HWPort5], 1, [1], 1, 7>;
defm : HWWriteResPair<WriteFLogicZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1
diff --git a/llvm/lib/Target/X86/X86SchedSapphireRapids.td b/llvm/lib/Target/X86/X86SchedSapphireRapids.td
index 88bb9ad8f1d7..ff3fe32be185 100644
--- a/llvm/lib/Target/X86/X86SchedSapphireRapids.td
+++ b/llvm/lib/Target/X86/X86SchedSapphireRapids.td
@@ -2290,8 +2290,8 @@ def SPRWriteResGroup218 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_11]> {
let Latency = 15;
let NumMicroOps = 3;
}
-def : InstRW<[SPRWriteResGroup218], (instregex "^(V?)ROUNDP(D|S)m$")>;
-def : InstRW<[SPRWriteResGroup218, ReadAfterVecXLd], (instregex "^(V?)ROUNDS(D|S)m((_Int)?)$",
+def : InstRW<[SPRWriteResGroup218], (instregex "^(V?)ROUNDP(D|S)mi$")>;
+def : InstRW<[SPRWriteResGroup218, ReadAfterVecXLd], (instregex "^(V?)ROUNDS(D|S)mi((_Int)?)$",
"^VRNDSCALEP(D|S)Z128rm(bi|ik)$",
"^VRNDSCALEP(D|S)Z128rmbik(z?)$",
"^VRNDSCALEP(D|S)Z128rmi((kz)?)$",
@@ -2303,13 +2303,13 @@ def SPRWriteResGroup219 : SchedWriteRes<[SPRPort00_01]> {
let Latency = 8;
let NumMicroOps = 2;
}
-def : InstRW<[SPRWriteResGroup219], (instregex "^(V?)ROUND(PD|SS)r$",
- "^(V?)ROUND(PS|SD)r$",
- "^(V?)ROUNDS(D|S)r_Int$",
+def : InstRW<[SPRWriteResGroup219], (instregex "^(V?)ROUND(PD|SS)ri$",
+ "^(V?)ROUND(PS|SD)ri$",
+ "^(V?)ROUNDS(D|S)ri_Int$",
"^VRNDSCALEP(D|S)Z(128|256)rri((k|kz)?)$",
"^VRNDSCALES(D|S)Zr$",
"^VRNDSCALES(D|S)Zr(b?)_Int((k|kz)?)$",
- "^VROUNDP(D|S)Yr$")>;
+ "^VROUNDP(D|S)Yri$")>;
def SPRWriteResGroup220 : SchedWriteRes<[SPRPort00_06]> {
let ReleaseAtCycles = [2];
@@ -3737,7 +3737,7 @@ def SPRWriteResGroup390 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_11]> {
let NumMicroOps = 3;
}
def : InstRW<[SPRWriteResGroup390], (instregex "^VF(C?)MADDCPHZ(128|256)m(b?)$",
- "^VROUNDP(D|S)Ym$")>;
+ "^VROUNDP(D|S)Ymi$")>;
def : InstRW<[SPRWriteResGroup390, ReadAfterVecXLd], (instregex "^VF(C?)MADDCSHZm$",
"^VF(C?)MULCPHZ128rm(b?)$",
"^VF(C?)MULCSHZrm$",
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td
index d90c8bd284eb..2e87d5262818 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver3.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td
@@ -52,7 +52,7 @@ def Znver3Model : SchedMachineModel {
int VecLoadLatency = 7;
// Latency of a simple store operation.
int StoreLatency = 1;
- // FIXME
+ // FIXME:
let HighLatency = 25; // FIXME: any better choice?
// AMD SOG 19h, 2.8 Optimizing Branching
// The branch misprediction penalty is in the range from 11 to 18 cycles,
@@ -193,11 +193,11 @@ def Zn3Int : ProcResGroup<[Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0
// <...>, and six FPU pipes.
// Agner, 22.10 Floating point execution pipes
// There are six floating point/vector execution pipes,
-def Zn3FPP0 : ProcResource<1>;
-def Zn3FPP1 : ProcResource<1>;
-def Zn3FPP2 : ProcResource<1>;
-def Zn3FPP3 : ProcResource<1>;
-def Zn3FPP45 : ProcResource<2>;
+def Zn3FP0 : ProcResource<1>;
+def Zn3FP1 : ProcResource<1>;
+def Zn3FP2 : ProcResource<1>;
+def Zn3FP3 : ProcResource<1>;
+def Zn3FP45 : ProcResource<2>;
//
// Execution Units
@@ -205,63 +205,63 @@ def Zn3FPP45 : ProcResource<2>;
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
-defvar Zn3FPFMul0 = Zn3FPP0;
-defvar Zn3FPFMul1 = Zn3FPP1;
+defvar Zn3FPFMul0 = Zn3FP0;
+defvar Zn3FPFMul1 = Zn3FP1;
// (v)FADD*
-defvar Zn3FPFAdd0 = Zn3FPP2;
-defvar Zn3FPFAdd1 = Zn3FPP3;
+defvar Zn3FPFAdd0 = Zn3FP2;
+defvar Zn3FPFAdd1 = Zn3FP3;
// All convert operations except pack/unpack
-defvar Zn3FPFCvt0 = Zn3FPP2;
-defvar Zn3FPFCvt1 = Zn3FPP3;
+defvar Zn3FPFCvt0 = Zn3FP2;
+defvar Zn3FPFCvt1 = Zn3FP3;
// All Divide and Square Root except Reciprocal Approximation
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// FDIV unit can support 2 simultaneous operations in flight
// even though it occupies a single pipe.
// FIXME: BufferSize=2 ?
-defvar Zn3FPFDiv = Zn3FPP1;
+defvar Zn3FPFDiv = Zn3FP1;
// Moves and Logical operations on Floating Point Data Types
-defvar Zn3FPFMisc0 = Zn3FPP0;
-defvar Zn3FPFMisc1 = Zn3FPP1;
-defvar Zn3FPFMisc2 = Zn3FPP2;
-defvar Zn3FPFMisc3 = Zn3FPP3;
+defvar Zn3FPFMisc0 = Zn3FP0;
+defvar Zn3FPFMisc1 = Zn3FP1;
+defvar Zn3FPFMisc2 = Zn3FP2;
+defvar Zn3FPFMisc3 = Zn3FP3;
// Integer Adds, Subtracts, and Compares
// Some complex VADD operations are not available in all pipes.
-defvar Zn3FPVAdd0 = Zn3FPP0;
-defvar Zn3FPVAdd1 = Zn3FPP1;
-defvar Zn3FPVAdd2 = Zn3FPP2;
-defvar Zn3FPVAdd3 = Zn3FPP3;
+defvar Zn3FPVAdd0 = Zn3FP0;
+defvar Zn3FPVAdd1 = Zn3FP1;
+defvar Zn3FPVAdd2 = Zn3FP2;
+defvar Zn3FPVAdd3 = Zn3FP3;
// Integer Multiplies, SAD, Blendvb
-defvar Zn3FPVMul0 = Zn3FPP0;
-defvar Zn3FPVMul1 = Zn3FPP3;
+defvar Zn3FPVMul0 = Zn3FP0;
+defvar Zn3FPVMul1 = Zn3FP3;
// Data Shuffles, Packs, Unpacks, Permute
// Some complex shuffle operations are only available in pipe1.
-defvar Zn3FPVShuf = Zn3FPP1;
-defvar Zn3FPVShufAux = Zn3FPP2;
+defvar Zn3FPVShuf = Zn3FP1;
+defvar Zn3FPVShufAux = Zn3FP2;
// Bit Shift Left/Right operations
-defvar Zn3FPVShift0 = Zn3FPP1;
-defvar Zn3FPVShift1 = Zn3FPP2;
+defvar Zn3FPVShift0 = Zn3FP1;
+defvar Zn3FPVShift1 = Zn3FP2;
// Moves and Logical operations on Packed Integer Data Types
-defvar Zn3FPVMisc0 = Zn3FPP0;
-defvar Zn3FPVMisc1 = Zn3FPP1;
-defvar Zn3FPVMisc2 = Zn3FPP2;
-defvar Zn3FPVMisc3 = Zn3FPP3;
+defvar Zn3FPVMisc0 = Zn3FP0;
+defvar Zn3FPVMisc1 = Zn3FP1;
+defvar Zn3FPVMisc2 = Zn3FP2;
+defvar Zn3FPVMisc3 = Zn3FP3;
// *AES*
-defvar Zn3FPAES0 = Zn3FPP0;
-defvar Zn3FPAES1 = Zn3FPP1;
+defvar Zn3FPAES0 = Zn3FP0;
+defvar Zn3FPAES1 = Zn3FP1;
// *CLM*
-defvar Zn3FPCLM0 = Zn3FPP0;
-defvar Zn3FPCLM1 = Zn3FPP1;
+defvar Zn3FPCLM0 = Zn3FP0;
+defvar Zn3FPCLM1 = Zn3FP1;
// Execution pipeline grouping
//===----------------------------------------------------------------------===//
@@ -269,7 +269,7 @@ defvar Zn3FPCLM1 = Zn3FPP1;
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
-def Zn3FPU0123 : ProcResGroup<[Zn3FPP0, Zn3FPP1, Zn3FPP2, Zn3FPP3]>;
+def Zn3FPU0123 : ProcResGroup<[Zn3FP0, Zn3FP1, Zn3FP2, Zn3FP3]>;
// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
def Zn3FPFMul01 : ProcResGroup<[Zn3FPFMul0, Zn3FPFMul1]>;
@@ -293,12 +293,12 @@ def Zn3FPFMisc12 : ProcResGroup<[Zn3FPFMisc1, Zn3FPFMisc2]>;
// AMD SOG 19h, 2.11 Floating-Point Unit
// Stores and floating point to general purpose register transfer
// have 2 dedicated pipelines (pipe 5 and 6).
-defvar Zn3FPLd01 = Zn3FPP45;
+defvar Zn3FPLd01 = Zn3FP45;
// AMD SOG 19h, 2.11 Floating-Point Unit
// Note that FP stores are supported on two pipelines,
// but throughput is limited to one per cycle.
-let Super = Zn3FPP45 in
+let Super = Zn3FP45 in
def Zn3FPSt : ProcResource<1>;
// Integer Adds, Subtracts, and Compares
@@ -345,8 +345,8 @@ def Zn3FpPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 1], [0, 1, 1],
// AMD SOG 19h, 2.11 Floating-Point Unit
// <...> the scheduler can issue 1 micro op per cycle for each pipe.
// FIXME: those are two separate schedulers, not a single big one.
-def Zn3FP : ProcResGroup<[Zn3FPP0, Zn3FPP2, /*Zn3FPP4,*/ // scheduler 0
- Zn3FPP1, Zn3FPP3, Zn3FPP45 /*Zn3FPP5*/ // scheduler 1
+def Zn3FP : ProcResGroup<[Zn3FP0, Zn3FP2, /*Zn3FP4,*/ // scheduler 0
+ Zn3FP1, Zn3FP3, Zn3FP45 /*Zn3FP5*/ // scheduler 1
]> {
let BufferSize = !mul(2, 32);
}
@@ -838,9 +838,9 @@ defm : Zn3WriteResInt<WriteZero, [Zn3ALU0123], 0, [0], 1>;
defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
// Floating point. This covers both scalar and vector operations.
-defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
-defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
-defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FPP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
+defm : Zn3WriteResInt<WriteFLD0, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 4), [1, 1, 1], 1>;
+defm : Zn3WriteResInt<WriteFLD1, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
+defm : Zn3WriteResInt<WriteFLDC, [Zn3FPLd01, Zn3Load, Zn3FP1], !add(Znver3Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn3WriteResXMM<WriteFLoad, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResXMM<WriteFLoadX, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn3WriteResYMM<WriteFLoadY, [Zn3FPLd01, Zn3Load], !add(Znver3Model.VecLoadLatency, 1), [1, 1], 1>;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 2ec29463d2fa..cd6102914cc3 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2664,9 +2664,9 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
};
static const TypeConversionCostTblEntry AVXConversionTbl[] = {
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
diff --git a/llvm/lib/TextAPI/InterfaceFile.cpp b/llvm/lib/TextAPI/InterfaceFile.cpp
index 9979df92674c..79694c90370f 100644
--- a/llvm/lib/TextAPI/InterfaceFile.cpp
+++ b/llvm/lib/TextAPI/InterfaceFile.cpp
@@ -54,7 +54,7 @@ void InterfaceFile::addParentUmbrella(const Target &Target_, StringRef Parent) {
ParentUmbrellas.emplace(Iter, Target_, std::string(Parent));
}
-void InterfaceFile::addRPath(const Target &InputTarget, StringRef RPath) {
+void InterfaceFile::addRPath(StringRef RPath, const Target &InputTarget) {
if (RPath.empty())
return;
using RPathEntryT = const std::pair<Target, std::string>;
@@ -198,9 +198,9 @@ InterfaceFile::merge(const InterfaceFile *O) const {
IF->addReexportedLibrary(Lib.getInstallName(), Target);
for (const auto &[Target, Path] : rpaths())
- IF->addRPath(Target, Path);
+ IF->addRPath(Path, Target);
for (const auto &[Target, Path] : O->rpaths())
- IF->addRPath(Target, Path);
+ IF->addRPath(Path, Target);
for (const auto *Sym : symbols()) {
IF->addSymbol(Sym->getKind(), Sym->getName(), Sym->targets(),
@@ -319,7 +319,7 @@ InterfaceFile::extract(Architecture Arch) const {
for (const auto &It : rpaths())
if (It.first.Arch == Arch)
- IF->addRPath(It.first, It.second);
+ IF->addRPath(It.second, It.first);
for (const auto &Lib : allowableClients())
for (const auto &Target : Lib.targets())
diff --git a/llvm/lib/TextAPI/TextStubV5.cpp b/llvm/lib/TextAPI/TextStubV5.cpp
index d96981035ddd..b072c0b5d69d 100644
--- a/llvm/lib/TextAPI/TextStubV5.cpp
+++ b/llvm/lib/TextAPI/TextStubV5.cpp
@@ -672,7 +672,7 @@ Expected<IFPtr> parseToInterfaceFile(const Object *File) {
F->addParentUmbrella(Target, Lib);
for (auto &[Path, Targets] : RPaths)
for (auto Target : Targets)
- F->addRPath(Target, Path);
+ F->addRPath(Path, Target);
for (auto &[Targets, Symbols] : Exports)
for (auto &Sym : Symbols)
F->addSymbol(Sym.Kind, Sym.Name, Targets, Sym.Flags);
diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 3986359b6a5a..4df18c824927 100644
--- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -583,10 +583,8 @@ llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
// RemoveDIs: there's no bitcode representation of the DbgVariableRecord
// debug-info, convert to dbg.values before writing out.
- bool ConvertToOldDbgFormatForWrite =
- M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode;
- if (ConvertToOldDbgFormatForWrite)
- M.convertFromNewDbgValues();
+ ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat &&
+ WriteNewDbgInfoFormatToBitcode);
bool Changed = writeThinLTOBitcode(
OS, ThinLinkOS,
@@ -595,8 +593,5 @@ llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
},
M, &AM.getResult<ModuleSummaryIndexAnalysis>(M));
- if (ConvertToOldDbgFormatForWrite)
- M.convertToNewDbgValues();
-
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index d0d349c891a3..ad1cd9c1f6bf 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -182,18 +182,11 @@ static cl::opt<bool> ClWithTls(
"platforms that support this"),
cl::Hidden, cl::init(true));
-static cl::opt<bool>
- CSelectiveInstrumentation("hwasan-selective-instrumentation",
- cl::desc("Use selective instrumentation"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<int> ClHotPercentileCutoff(
- "hwasan-percentile-cutoff-hot", cl::init(0),
- cl::desc("Alternative hot percentile cuttoff."
- "By default `-profile-summary-cutoff-hot` is used."));
+static cl::opt<int> ClHotPercentileCutoff("hwasan-percentile-cutoff-hot",
+ cl::desc("Hot percentile cuttoff."));
static cl::opt<float>
- ClRandomSkipRate("hwasan-random-skip-rate", cl::init(0),
+ ClRandomSkipRate("hwasan-random-skip-rate",
cl::desc("Probability value in the range [0.0, 1.0] "
"to skip instrumentation of a function."));
@@ -317,7 +310,7 @@ private:
};
bool selectiveInstrumentationShouldSkip(Function &F,
- FunctionAnalysisManager &FAM);
+ FunctionAnalysisManager &FAM) const;
void initializeModule();
void createHwasanCtorComdat();
@@ -1500,28 +1493,22 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
}
bool HWAddressSanitizer::selectiveInstrumentationShouldSkip(
- Function &F, FunctionAnalysisManager &FAM) {
+ Function &F, FunctionAnalysisManager &FAM) const {
if (ClRandomSkipRate.getNumOccurrences()) {
std::bernoulli_distribution D(ClRandomSkipRate);
- if (D(*Rng))
- return true;
- } else {
- auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- ProfileSummaryInfo *PSI =
- MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- if (PSI && PSI->hasProfileSummary()) {
- auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
- if ((ClHotPercentileCutoff.getNumOccurrences() &&
- ClHotPercentileCutoff >= 0)
- ? PSI->isFunctionHotInCallGraphNthPercentile(
- ClHotPercentileCutoff, &F, BFI)
- : PSI->isFunctionHotInCallGraph(&F, BFI))
- return true;
- } else {
- ++NumNoProfileSummaryFuncs;
- }
+ return (D(*Rng));
}
- return false;
+ if (!ClHotPercentileCutoff.getNumOccurrences())
+ return false;
+ auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ ProfileSummaryInfo *PSI =
+ MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ if (!PSI || !PSI->hasProfileSummary()) {
+ ++NumNoProfileSummaryFuncs;
+ return false;
+ }
+ return PSI->isFunctionHotInCallGraphNthPercentile(
+ ClHotPercentileCutoff, &F, FAM.getResult<BlockFrequencyAnalysis>(F));
}
void HWAddressSanitizer::sanitizeFunction(Function &F,
@@ -1537,7 +1524,7 @@ void HWAddressSanitizer::sanitizeFunction(Function &F,
NumTotalFuncs++;
- if (CSelectiveInstrumentation && selectiveInstrumentationShouldSkip(F, FAM))
+ if (selectiveInstrumentationShouldSkip(F, FAM))
return;
NumInstrumentedFuncs++;
diff --git a/llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp b/llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp
index d87f7482a21d..6adc29f8572b 100644
--- a/llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp
+++ b/llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
@@ -22,13 +23,11 @@ using namespace llvm;
#define DEBUG_TYPE "remove-traps"
-static cl::opt<int> HotPercentileCutoff(
- "remove-traps-percentile-cutoff-hot", cl::init(0),
- cl::desc("Alternative hot percentile cuttoff. By default "
- "`-profile-summary-cutoff-hot` is used."));
+static cl::opt<int> HotPercentileCutoff("remove-traps-percentile-cutoff-hot",
+ cl::desc("Hot percentile cutoff."));
static cl::opt<float>
- RandomRate("remove-traps-random-rate", cl::init(0.0),
+ RandomRate("remove-traps-random-rate",
cl::desc("Probability value in the range [0.0, 1.0] of "
"unconditional pseudo-random checks removal."));
@@ -37,9 +36,11 @@ STATISTIC(NumChecksRemoved, "Number of removed checks");
static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI,
const ProfileSummaryInfo *PSI) {
- SmallVector<IntrinsicInst *, 16> Remove;
+ SmallVector<std::pair<IntrinsicInst *, bool>, 16> ReplaceWithValue;
std::unique_ptr<RandomNumberGenerator> Rng;
+ // TODO:
+ // https://github.com/llvm/llvm-project/pull/84858#discussion_r1520603139
auto ShouldRemove = [&](bool IsHot) {
if (!RandomRate.getNumOccurrences())
return IsHot;
@@ -56,26 +57,23 @@ static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI,
continue;
auto ID = II->getIntrinsicID();
switch (ID) {
- case Intrinsic::ubsantrap: {
+ case Intrinsic::allow_ubsan_check:
+ case Intrinsic::allow_runtime_check: {
++NumChecksTotal;
bool IsHot = false;
if (PSI) {
- uint64_t Count = 0;
- for (const auto *PR : predecessors(&BB))
- Count += BFI.getBlockProfileCount(PR).value_or(0);
-
- IsHot =
- HotPercentileCutoff.getNumOccurrences()
- ? (HotPercentileCutoff > 0 &&
- PSI->isHotCountNthPercentile(HotPercentileCutoff, Count))
- : PSI->isHotCount(Count);
+ uint64_t Count = BFI.getBlockProfileCount(&BB).value_or(0);
+ IsHot = PSI->isHotCountNthPercentile(HotPercentileCutoff, Count);
}
- if (ShouldRemove(IsHot)) {
- Remove.push_back(II);
+ bool ToRemove = ShouldRemove(IsHot);
+ ReplaceWithValue.push_back({
+ II,
+ ToRemove,
+ });
+ if (ToRemove)
++NumChecksRemoved;
- }
break;
}
default:
@@ -84,10 +82,12 @@ static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI,
}
}
- for (IntrinsicInst *I : Remove)
+ for (auto [I, V] : ReplaceWithValue) {
+ I->replaceAllUsesWith(ConstantInt::getBool(I->getType(), !V));
I->eraseFromParent();
+ }
- return !Remove.empty();
+ return !ReplaceWithValue.empty();
}
PreservedAnalyses RemoveTrapsPass::run(Function &F,
@@ -102,3 +102,8 @@ PreservedAnalyses RemoveTrapsPass::run(Function &F,
return removeUbsanTraps(F, BFI, PSI) ? PreservedAnalyses::none()
: PreservedAnalyses::all();
}
+
+bool RemoveTrapsPass::IsRequested() {
+ return RandomRate.getNumOccurrences() ||
+ HotPercentileCutoff.getNumOccurrences();
+}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0834865173b2..cb0fd06554e6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -124,6 +124,7 @@
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/VectorBuilder.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -248,10 +249,12 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
"Create lane mask using active.lane.mask intrinsic, and use "
"it for both data and control flow"),
- clEnumValN(
- TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
- "data-and-control-without-rt-check",
- "Similar to data-and-control, but remove the runtime check")));
+ clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
+ "data-and-control-without-rt-check",
+ "Similar to data-and-control, but remove the runtime check"),
+ clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
+ "Use predicated EVL instructions for tail folding. If EVL "
+ "is unsupported, fallback to data-without-lane-mask.")));
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
@@ -1505,29 +1508,62 @@ public:
/// Returns the TailFoldingStyle that is best for the current loop.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
- return IVUpdateMayOverflow ? ChosenTailFoldingStyle.first
- : ChosenTailFoldingStyle.second;
+ if (!ChosenTailFoldingStyle)
+ return TailFoldingStyle::None;
+ return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
+ : ChosenTailFoldingStyle->second;
}
/// Selects and saves TailFoldingStyle for 2 options - if IV update may
/// overflow or not.
- void setTailFoldingStyles() {
- assert(ChosenTailFoldingStyle.first == TailFoldingStyle::None &&
- ChosenTailFoldingStyle.second == TailFoldingStyle::None &&
- "Tail folding must not be selected yet.");
- if (!Legal->prepareToFoldTailByMasking())
+ /// \param IsScalableVF true if scalable vector factors are enabled.
+ /// \param UserIC User specific interleave count.
+ void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+ assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
+ if (!Legal->prepareToFoldTailByMasking()) {
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
return;
+ }
- if (ForceTailFoldingStyle.getNumOccurrences()) {
- ChosenTailFoldingStyle.first = ChosenTailFoldingStyle.second =
- ForceTailFoldingStyle;
+ if (!ForceTailFoldingStyle.getNumOccurrences()) {
+ ChosenTailFoldingStyle = std::make_pair(
+ TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
+ TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
return;
}
- ChosenTailFoldingStyle.first =
- TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true);
- ChosenTailFoldingStyle.second =
- TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false);
+ // Set styles when forced.
+ ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
+ ForceTailFoldingStyle.getValue());
+ if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
+ return;
+ // Override forced styles if needed.
+ // FIXME: use actual opcode/data type for analysis here.
+ // FIXME: Investigate opportunity for fixed vector factor.
+ bool EVLIsLegal =
+ IsScalableVF && UserIC <= 1 &&
+ TTI.hasActiveVectorLength(0, nullptr, Align()) &&
+ !EnableVPlanNativePath &&
+ // FIXME: implement support for max safe dependency distance.
+ Legal->isSafeForAnyVectorWidth() &&
+ // FIXME: remove this once reductions are supported.
+ Legal->getReductionVars().empty();
+ if (!EVLIsLegal) {
+ // If for some reason EVL mode is unsupported, fallback to
+ // DataWithoutLaneMask to try to vectorize the loop with folded tail
+ // in a generic way.
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
+ TailFoldingStyle::DataWithoutLaneMask);
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Preference for VP intrinsics indicated. Will "
+ "not try to generate VP Intrinsics "
+ << (UserIC > 1
+ ? "since interleave count specified is greater than 1.\n"
+ : "due to non-interleaving reasons.\n"));
+ }
}
/// Returns true if all loop blocks should be masked to fold tail loop.
@@ -1544,6 +1580,18 @@ public:
return foldTailByMasking() || Legal->blockNeedsPredication(BB);
}
+ /// Returns true if VP intrinsics with explicit vector length support should
+ /// be generated in the tail folded loop.
+ bool foldTailWithEVL() const {
+ return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL &&
+ // FIXME: remove this once vp_reverse is supported.
+ none_of(
+ WideningDecisions,
+ [](const std::pair<std::pair<Instruction *, ElementCount>,
+ std::pair<InstWidening, InstructionCost>>
+ &Data) { return Data.second.first == CM_Widen_Reverse; });
+ }
+
/// Returns true if the Phi is part of an inloop reduction.
bool isInLoopReduction(PHINode *Phi) const {
return InLoopReductions.contains(Phi);
@@ -1688,8 +1736,8 @@ private:
/// Control finally chosen tail folding style. The first element is used if
/// the IV update may overflow, the second element - if it does not.
- std::pair<TailFoldingStyle, TailFoldingStyle> ChosenTailFoldingStyle =
- std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+ std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
+ ChosenTailFoldingStyle;
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
@@ -4647,9 +4695,24 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// found modulo the vectorization factor is not zero, try to fold the tail
// by masking.
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- setTailFoldingStyles();
- if (foldTailByMasking())
+ setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
+ if (foldTailByMasking()) {
+ if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
+ "try to generate VP Intrinsics with scalable vector "
+ "factors only.\n");
+ // Tail folded loop using VP intrinsics restricts the VF to be scalable
+ // for now.
+ // TODO: extend it for fixed vectors, if required.
+ assert(MaxFactors.ScalableVF.isScalable() &&
+ "Expected scalable vector factor.");
+
+ MaxFactors.FixedVF = ElementCount::getFixed(1);
+ }
return MaxFactors;
+ }
// If there was a tail-folding hint/switch, but we can't fold the tail by
// masking, fallback to a vectorization with a scalar epilogue.
@@ -5257,6 +5320,13 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
if (!isScalarEpilogueAllowed())
return 1;
+ // Do not interleave if EVL is preferred and no User IC is specified.
+ if (foldTailWithEVL()) {
+ LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
+ "Unroll factor forced to be 1.\n");
+ return 1;
+ }
+
// We used the distance for the interleave count.
if (!Legal->isSafeForAnyVectorWidth())
return 1;
@@ -8487,6 +8557,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
VPlanTransforms::truncateToMinimalBitwidths(
*Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
VPlanTransforms::optimize(*Plan, *PSE.getSE());
+ // TODO: try to put it close to addActiveLaneMask().
+ if (CM.foldTailWithEVL())
+ VPlanTransforms::addExplicitVectorLength(*Plan);
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
@@ -9179,7 +9252,7 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
Value *Step = State.get(getStepValue(), VPIteration(0, 0));
- Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
+ Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0));
Value *DerivedIV = emitTransformedIndex(
State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
Kind, cast_if_present<BinaryOperator>(FPBinOp));
@@ -9307,6 +9380,52 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
}
+/// Creates either vp_store or vp_scatter intrinsics calls to represent
+/// predicated store/scatter.
+static Instruction *
+lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
+ Value *StoredVal, bool IsScatter, Value *Mask,
+ Value *EVL, const Align &Alignment) {
+ CallInst *Call;
+ if (IsScatter) {
+ Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
+ Intrinsic::vp_scatter,
+ {StoredVal, Addr, Mask, EVL});
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ Call = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Store, Type::getVoidTy(EVL->getContext()),
+ {StoredVal, Addr}));
+ }
+ Call->addParamAttr(
+ 1, Attribute::getWithAlignment(Call->getContext(), Alignment));
+ return Call;
+}
+
+/// Creates either vp_load or vp_gather intrinsics calls to represent
+/// predicated load/gather.
+static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder,
+ VectorType *DataTy,
+ Value *Addr, bool IsGather,
+ Value *Mask, Value *EVL,
+ const Align &Alignment) {
+ CallInst *Call;
+ if (IsGather) {
+ Call =
+ Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
+ nullptr, "wide.masked.gather");
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ Call = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Load, DataTy, Addr, "vp.op.load"));
+ }
+ Call->addParamAttr(
+ 0, Attribute::getWithAlignment(Call->getContext(), Alignment));
+ return Call;
+}
+
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
@@ -9345,7 +9464,25 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
for (unsigned Part = 0; Part < State.UF; ++Part) {
Instruction *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue, Part);
- if (CreateGatherScatter) {
+ // TODO: split this into several classes for better design.
+ if (State.EVL) {
+ assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+ "explicit vector length.");
+ assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
+ VPInstruction::ExplicitVectorLength &&
+ "EVL must be VPInstruction::ExplicitVectorLength.");
+ Value *EVL = State.get(State.EVL, VPIteration(0, 0));
+ // If EVL is not nullptr, then EVL must be a valid value set during plan
+ // creation, possibly default value = whole vector register length. EVL
+ // is created only if TTI prefers predicated vectorization, thus if EVL
+ // is not nullptr it also implies preference for predicated
+ // vectorization.
+ // FIXME: Support reverse store after vp_reverse is added.
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ NewSI = lowerStoreUsingVectorIntrinsics(
+ Builder, State.get(getAddr(), Part, !CreateGatherScatter),
+ StoredVal, CreateGatherScatter, MaskPart, EVL, Alignment);
+ } else if (CreateGatherScatter) {
Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
Value *VectorGep = State.get(getAddr(), Part);
NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
@@ -9375,7 +9512,25 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *NewLI;
- if (CreateGatherScatter) {
+ // TODO: split this into several classes for better design.
+ if (State.EVL) {
+ assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+ "explicit vector length.");
+ assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
+ VPInstruction::ExplicitVectorLength &&
+ "EVL must be VPInstruction::ExplicitVectorLength.");
+ Value *EVL = State.get(State.EVL, VPIteration(0, 0));
+ // If EVL is not nullptr, then EVL must be a valid value set during plan
+ // creation, possibly default value = whole vector register length. EVL
+ // is created only if TTI prefers predicated vectorization, thus if EVL
+ // is not nullptr it also implies preference for predicated
+ // vectorization.
+ // FIXME: Support reverse loading after vp_reverse is added.
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ NewLI = lowerLoadUsingVectorIntrinsics(
+ Builder, DataTy, State.get(getAddr(), Part, !CreateGatherScatter),
+ CreateGatherScatter, MaskPart, EVL, Alignment);
+ } else if (CreateGatherScatter) {
Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
Value *VectorGep = State.get(getAddr(), Part);
NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 99769540f780..bdd26acfd2f8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1973,7 +1973,7 @@ public:
assert(isa<Instruction>(VL[0]) && "Expected instruction");
unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
constexpr unsigned IntrinsicNumOperands = 2;
- if (auto *CI = dyn_cast<IntrinsicInst>(VL[0]))
+ if (isa<IntrinsicInst>(VL[0]))
NumOperands = IntrinsicNumOperands;
OpsVec.resize(NumOperands);
unsigned NumLanes = VL.size();
@@ -14141,6 +14141,16 @@ bool BoUpSLP::collectValuesToDemote(
}))
return FinalAnalysis();
+ if (!all_of(I->users(),
+ [=](User *U) {
+ return getTreeEntry(U) ||
+ (UserIgnoreList && UserIgnoreList->contains(U)) ||
+ (U->getType()->isSized() &&
+ DL->getTypeSizeInBits(U->getType()) <= BitWidth);
+ }) &&
+ !IsPotentiallyTruncated(I, BitWidth))
+ return false;
+
unsigned Start = 0;
unsigned End = I->getNumOperands();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index f0b7008992d7..8ebd75da3465 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -871,13 +871,15 @@ void VPlan::execute(VPTransformState *State) {
// only a single part is generated, which provides the last part from the
// previous iteration. For non-ordered reductions all UF parts are
// generated.
- bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
- isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
- (isa<VPReductionPHIRecipe>(PhiR) &&
- cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
- bool NeedsScalar = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
- (isa<VPReductionPHIRecipe>(PhiR) &&
- cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
+ bool SinglePartNeeded =
+ isa<VPCanonicalIVPHIRecipe>(PhiR) ||
+ isa<VPFirstOrderRecurrencePHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
+ (isa<VPReductionPHIRecipe>(PhiR) &&
+ cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
+ bool NeedsScalar =
+ isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
+ (isa<VPReductionPHIRecipe>(PhiR) &&
+ cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 813ebda29ffd..77577b516ae2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -242,6 +242,15 @@ struct VPTransformState {
ElementCount VF;
unsigned UF;
+ /// If EVL (Explicit Vector Length) is not nullptr, then EVL must be a valid
+ /// value set during plan transformation, possibly a default value = whole
+ /// vector register length. EVL is created only if TTI prefers predicated
+ /// vectorization, thus if EVL is not nullptr it also implies preference for
+ /// predicated vectorization.
+ /// TODO: this is a temporary solution, the EVL must be explicitly used by
+ /// the recipes and must be removed here.
+ VPValue *EVL = nullptr;
+
/// Hold the indices to generate specific scalar instructions. Null indicates
/// that all instances are to be generated, using either scalar or vector
/// instructions.
@@ -1159,6 +1168,7 @@ public:
SLPLoad,
SLPStore,
ActiveLaneMask,
+ ExplicitVectorLength,
CalculateTripCountMinusVF,
// Increment the canonical IV separately for each unrolled part.
CanonicalIVIncrementForPart,
@@ -2489,6 +2499,45 @@ public:
#endif
};
+/// A recipe for generating the phi node for the current index of elements,
+/// adjusted in accordance with EVL value. It starts at the start value of the
+/// canonical induction and gets incremented by EVL in each iteration of the
+/// vector loop.
+class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
+public:
+ VPEVLBasedIVPHIRecipe(VPValue *StartIV, DebugLoc DL)
+ : VPHeaderPHIRecipe(VPDef::VPEVLBasedIVPHISC, nullptr, StartIV, DL) {}
+
+ ~VPEVLBasedIVPHIRecipe() override = default;
+
+ VPEVLBasedIVPHIRecipe *clone() override {
+ llvm_unreachable("cloning not implemented yet");
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPEVLBasedIVPHISC)
+
+ static inline bool classof(const VPHeaderPHIRecipe *D) {
+ return D->getVPDefID() == VPDef::VPEVLBasedIVPHISC;
+ }
+
+ /// Generate phi for handling IV based on EVL over iterations correctly.
+ /// TODO: investigate if it can share the code with VPCanonicalIVPHIRecipe.
+ void execute(VPTransformState &State) override;
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A Recipe for widening the canonical induction variable of the vector loop.
class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
public:
@@ -2522,8 +2571,8 @@ public:
}
};
-/// A recipe for converting the canonical IV value to the corresponding value of
-/// an IV with different start and step values, using Start + CanonicalIV *
+/// A recipe for converting the input value \p IV value to the corresponding
+/// value of an IV with different start and step values, using Start + IV *
/// Step.
class VPDerivedIVRecipe : public VPSingleDefRecipe {
/// Kind of the induction.
@@ -2541,16 +2590,16 @@ public:
Start, CanonicalIV, Step) {}
VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind,
- const FPMathOperator *FPBinOp, VPValue *Start,
- VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step)
- : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}),
- Kind(Kind), FPBinOp(FPBinOp) {}
+ const FPMathOperator *FPBinOp, VPValue *Start, VPValue *IV,
+ VPValue *Step)
+ : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, IV, Step}), Kind(Kind),
+ FPBinOp(FPBinOp) {}
~VPDerivedIVRecipe() override = default;
VPRecipeBase *clone() override {
- return new VPDerivedIVRecipe(Kind, FPBinOp, getStartValue(),
- getCanonicalIV(), getStepValue());
+ return new VPDerivedIVRecipe(Kind, FPBinOp, getStartValue(), getOperand(1),
+ getStepValue());
}
VP_CLASSOF_IMPL(VPDef::VPDerivedIVSC)
@@ -2570,9 +2619,6 @@ public:
}
VPValue *getStartValue() const { return getOperand(0); }
- VPCanonicalIVPHIRecipe *getCanonicalIV() const {
- return cast<VPCanonicalIVPHIRecipe>(getOperand(1));
- }
VPValue *getStepValue() const { return getOperand(2); }
/// Returns true if the recipe only uses the first lane of operand \p Op.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 04e30312dc23..c8ae2ee5a30f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -216,14 +216,14 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
Type *ResultTy =
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
.Case<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe,
- VPReductionPHIRecipe, VPWidenPointerInductionRecipe>(
- [this](const auto *R) {
- // Handle header phi recipes, except VPWienIntOrFpInduction
- // which needs special handling due it being possibly truncated.
- // TODO: consider inferring/caching type of siblings, e.g.,
- // backedge value, here and in cases below.
- return inferScalarType(R->getStartValue());
- })
+ VPReductionPHIRecipe, VPWidenPointerInductionRecipe,
+ VPEVLBasedIVPHIRecipe>([this](const auto *R) {
+ // Handle header phi recipes, except VPWidenIntOrFpInduction
+ // which needs special handling due to it being possibly truncated.
+ // TODO: consider inferring/caching type of siblings, e.g.,
+ // backedge value, here and in cases below.
+ return inferScalarType(R->getStartValue());
+ })
.Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
[](const auto *R) { return R->getScalarType(); })
.Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 124ae3108d8a..1be0287ce7c9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -286,6 +286,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ComputeReductionResult:
case VPInstruction::PtrAdd:
+ case VPInstruction::ExplicitVectorLength:
return true;
default:
return false;
@@ -386,6 +387,33 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
return Builder.CreateSelect(Cmp, Sub, Zero);
}
+ case VPInstruction::ExplicitVectorLength: {
+ // Compute EVL
+ auto GetEVL = [=](VPTransformState &State, Value *AVL) {
+ assert(AVL->getType()->isIntegerTy() &&
+ "Requested vector length should be an integer.");
+
+ // TODO: Add support for MaxSafeDist for correct loop emission.
+ assert(State.VF.isScalable() && "Expected scalable vector factor.");
+ Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
+
+ Value *EVL = State.Builder.CreateIntrinsic(
+ State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
+ {AVL, VFArg, State.Builder.getTrue()});
+ return EVL;
+ };
+ // TODO: Restructure this code with an explicit remainder loop, vsetvli can
+ // be outside of the main loop.
+ assert(Part == 0 && "No unrolling expected for predicated vectorization.");
+ // Compute VTC - IV as the AVL (requested vector length).
+ Value *Index = State.get(getOperand(0), VPIteration(0, 0));
+ Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
+ Value *AVL = State.Builder.CreateSub(TripCount, Index);
+ Value *EVL = GetEVL(State, AVL);
+ assert(!State.EVL && "multiple EVL recipes");
+ State.EVL = this;
+ return EVL;
+ }
case VPInstruction::CanonicalIVIncrementForPart: {
auto *IV = State.get(getOperand(0), VPIteration(0, 0));
if (Part == 0)
@@ -592,6 +620,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
case VPInstruction::ActiveLaneMask:
+ case VPInstruction::ExplicitVectorLength:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::BranchOnCount:
@@ -628,6 +657,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ActiveLaneMask:
O << "active lane mask";
break;
+ case VPInstruction::ExplicitVectorLength:
+ O << "EXPLICIT-VECTOR-LENGTH";
+ break;
case VPInstruction::FirstOrderRecurrenceSplice:
O << "first-order splice";
break;
@@ -1184,7 +1216,7 @@ void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
O << Indent << "= DERIVED-IV ";
getStartValue()->printAsOperand(O, SlotTracker);
O << " + ";
- getCanonicalIV()->printAsOperand(O, SlotTracker);
+ getOperand(1)->printAsOperand(O, SlotTracker);
O << " * ";
getStepValue()->printAsOperand(O, SlotTracker);
}
@@ -1974,3 +2006,25 @@ void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
printOperands(O, SlotTracker);
}
#endif
+
+void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
+ BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+ assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
+ Value *Start = State.get(getOperand(0), VPIteration(0, 0));
+ PHINode *EntryPart =
+ State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
+ EntryPart->addIncoming(Start, VectorPH);
+ EntryPart->setDebugLoc(getDebugLoc());
+ State.set(this, EntryPart, 0, /*IsScalar=*/true);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
+
+ printAsOperand(O, SlotTracker);
+ O << " = phi ";
+ printOperands(O, SlotTracker);
+}
+#endif
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3753060cd6ec..1256e4d8fda5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -641,6 +641,25 @@ static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
}
}
+static void recursivelyDeleteDeadRecipes(VPValue *V) {
+ SmallVector<VPValue *> WorkList;
+ SmallPtrSet<VPValue *, 8> Seen;
+ WorkList.push_back(V);
+
+ while (!WorkList.empty()) {
+ VPValue *Cur = WorkList.pop_back_val();
+ if (!Seen.insert(Cur).second)
+ continue;
+ VPRecipeBase *R = Cur->getDefiningRecipe();
+ if (!R)
+ continue;
+ if (!isDeadRecipe(*R))
+ continue;
+ WorkList.append(R->op_begin(), R->op_end());
+ R->eraseFromParent();
+ }
+}
+
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
unsigned BestUF,
PredicatedScalarEvolution &PSE) {
@@ -674,7 +693,11 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
auto *BOC =
new VPInstruction(VPInstruction::BranchOnCond,
{Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))});
+
+ SmallVector<VPValue *> PossiblyDead(Term->operands());
Term->eraseFromParent();
+ for (VPValue *Op : PossiblyDead)
+ recursivelyDeleteDeadRecipes(Op);
ExitingVPBB->appendRecipe(BOC);
Plan.setVF(BestVF);
Plan.setUF(BestUF);
@@ -1186,6 +1209,45 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
return LaneMaskPhi;
}
+/// Replaces (ICMP_ULE, WideCanonicalIV, backedge-taken-count) pattern using
+/// the given \p Idiom.
+static void
+replaceHeaderPredicateWith(VPlan &Plan, VPValue &Idiom,
+ function_ref<bool(VPUser &, unsigned)> Cond = {}) {
+ auto *FoundWidenCanonicalIVUser =
+ find_if(Plan.getCanonicalIV()->users(),
+ [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
+ if (FoundWidenCanonicalIVUser == Plan.getCanonicalIV()->users().end())
+ return;
+ auto *WideCanonicalIV =
+ cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
+ // Walk users of WideCanonicalIV and replace all compares of the form
+ // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with
+ // the given idiom VPValue.
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+ for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
+ auto *CompareToReplace = dyn_cast<VPInstruction>(U);
+ if (!CompareToReplace ||
+ CompareToReplace->getOpcode() != Instruction::ICmp ||
+ CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
+ CompareToReplace->getOperand(1) != BTC)
+ continue;
+
+ assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
+ "WidenCanonicalIV must be the first operand of the compare");
+ if (Cond) {
+ CompareToReplace->replaceUsesWithIf(&Idiom, Cond);
+ if (!CompareToReplace->getNumUsers())
+ CompareToReplace->eraseFromParent();
+ } else {
+ CompareToReplace->replaceAllUsesWith(&Idiom);
+ CompareToReplace->eraseFromParent();
+ }
+ }
+ if (!WideCanonicalIV->getNumUsers())
+ WideCanonicalIV->eraseFromParent();
+}
+
void VPlanTransforms::addActiveLaneMask(
VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
bool DataAndControlFlowWithoutRuntimeCheck) {
@@ -1215,20 +1277,77 @@ void VPlanTransforms::addActiveLaneMask(
// Walk users of WideCanonicalIV and replace all compares of the form
// (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
// active-lane-mask.
- VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
- for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
- auto *CompareToReplace = dyn_cast<VPInstruction>(U);
- if (!CompareToReplace ||
- CompareToReplace->getOpcode() != Instruction::ICmp ||
- CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
- CompareToReplace->getOperand(1) != BTC)
- continue;
+ replaceHeaderPredicateWith(Plan, *LaneMask);
+}
- assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
- "WidenCanonicalIV must be the first operand of the compare");
- CompareToReplace->replaceAllUsesWith(LaneMask);
- CompareToReplace->eraseFromParent();
+/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
+/// replaces all uses except the canonical IV increment of
+/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
+/// is used only for loop iterations counting after this transformation.
+///
+/// The function uses the following definitions:
+/// %StartV is the canonical induction start value.
+///
+/// The function adds the following recipes:
+///
+/// vector.ph:
+/// ...
+///
+/// vector.body:
+/// ...
+/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
+/// [ %NextEVLIV, %vector.body ]
+/// %VPEVL = EXPLICIT-VECTOR-LENGTH %EVLPhi, original TC
+/// ...
+/// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
+/// ...
+///
+void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) {
+ VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ auto *CanonicalIVPHI = Plan.getCanonicalIV();
+ VPValue *StartV = CanonicalIVPHI->getStartValue();
+
+ // TODO: revisit this and try to remove the mask operand.
+ // Walk VPWidenMemoryInstructionRecipe users of WideCanonicalIV and replace
+ // all compares of the form (ICMP_ULE, WideCanonicalIV, backedge-taken-count),
+ // used as mask in VPWidenMemoryInstructionRecipe, with an all-true-mask.
+ Value *TrueMask =
+ ConstantInt::getTrue(CanonicalIVPHI->getScalarType()->getContext());
+ VPValue *VPTrueMask = Plan.getOrAddLiveIn(TrueMask);
+ replaceHeaderPredicateWith(Plan, *VPTrueMask, [](VPUser &U, unsigned) {
+ return isa<VPWidenMemoryInstructionRecipe>(U);
+ });
+ // Now create the ExplicitVectorLengthPhi recipe in the main loop.
+ auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
+ EVLPhi->insertAfter(CanonicalIVPHI);
+ auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength,
+ {EVLPhi, Plan.getTripCount()});
+ VPEVL->insertBefore(*Header, Header->getFirstNonPhi());
+
+ auto *CanonicalIVIncrement =
+ cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
+ VPSingleDefRecipe *OpVPEVL = VPEVL;
+ if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits();
+ IVSize != 32) {
+ OpVPEVL = new VPScalarCastRecipe(IVSize < 32 ? Instruction::Trunc
+ : Instruction::ZExt,
+ OpVPEVL, CanonicalIVPHI->getScalarType());
+ OpVPEVL->insertBefore(CanonicalIVIncrement);
}
+ auto *NextEVLIV =
+ new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi},
+ {CanonicalIVIncrement->hasNoUnsignedWrap(),
+ CanonicalIVIncrement->hasNoSignedWrap()},
+ CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
+ NextEVLIV->insertBefore(CanonicalIVIncrement);
+ EVLPhi->addOperand(NextEVLIV);
+
+ // Replace all uses of VPCanonicalIVPHIRecipe by
+ // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
+ CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
+ CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
+ // TODO: support unroll factor > 1.
+ Plan.setUF(1);
}
void VPlanTransforms::dropPoisonGeneratingRecipes(
@@ -1254,9 +1373,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
// handled.
if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
isa<VPInterleaveRecipe>(CurRec) ||
- isa<VPScalarIVStepsRecipe>(CurRec) ||
- isa<VPCanonicalIVPHIRecipe>(CurRec) ||
- isa<VPActiveLaneMaskPHIRecipe>(CurRec))
+ isa<VPScalarIVStepsRecipe>(CurRec) || isa<VPHeaderPHIRecipe>(CurRec))
continue;
// This recipe contributes to the address computation of a widen
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index ff83c3f083b0..0cbc70713d9c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -98,6 +98,13 @@ struct VPlanTransforms {
/// VPlan directly.
static void dropPoisonGeneratingRecipes(
VPlan &Plan, function_ref<bool(BasicBlock *)> BlockNeedsPredication);
+
+ /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
+ /// replaces all uses except the canonical IV increment of
+ /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe.
+ /// VPCanonicalIVPHIRecipe is only used to control the loop after
+ /// this transformation.
+ static void addExplicitVectorLength(VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 1d2c17e91b7a..8b221d30e525 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -368,6 +368,7 @@ public:
// VPHeaderPHIRecipe need to be kept together.
VPCanonicalIVPHISC,
VPActiveLaneMaskPHISC,
+ VPEVLBasedIVPHISC,
VPFirstOrderRecurrencePHISC,
VPWidenIntOrFpInductionSC,
VPWidenPointerInductionSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 7ebdb914fb85..12d37fa711db 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -92,7 +92,50 @@ static bool verifyVPBasicBlock(const VPBasicBlock *VPBB,
for (const VPRecipeBase &R : *VPBB)
RecipeNumbering[&R] = Cnt++;
+ // Set of recipe types along with VPInstruction Opcodes of all EVL-related
+ // recipes that must appear at most once in the header block.
+ DenseSet<unsigned> EVLFound;
+ const VPRecipeBase *VPWidenMemRecipe = nullptr;
+ const VPlan *Plan = VPBB->getPlan();
+ bool IsHeader = Plan->getEntry()->getNumSuccessors() == 1 &&
+ Plan->getVectorLoopRegion()->getEntry() == VPBB;
+ auto CheckEVLRecipiesInsts = [&](const VPRecipeBase *R) {
+ if (isa<VPEVLBasedIVPHIRecipe>(R)) {
+ if (!IsHeader) {
+ errs() << "EVL PHI recipe not in entry block!\n";
+ return false;
+ }
+ if (!EVLFound.insert(VPDef::VPEVLBasedIVPHISC).second) {
+ errs() << "EVL PHI recipe inserted more than once!\n";
+ return false;
+ }
+ return true;
+ }
+ if (const auto *RInst = dyn_cast<VPInstruction>(R);
+ RInst && RInst->getOpcode() == VPInstruction::ExplicitVectorLength) {
+ if (!IsHeader) {
+ errs() << "EVL instruction not in the header block!\n";
+ return false;
+ }
+ if (!EVLFound.insert(RInst->getOpcode() + VPDef::VPLastPHISC).second) {
+ errs() << "EVL instruction inserted more than once!\n";
+ return false;
+ }
+ if (VPWidenMemRecipe) {
+ errs() << "Use of EVL instruction by widen memory recipe before "
+ "definition!\n";
+ return false;
+ }
+ return true;
+ }
+ if (isa<VPWidenMemoryInstructionRecipe>(R))
+ VPWidenMemRecipe = R;
+ return true;
+ };
+
for (const VPRecipeBase &R : *VPBB) {
+ if (!CheckEVLRecipiesInsts(&R))
+ return false;
for (const VPValue *V : R.definedValues()) {
for (const VPUser *U : V->users()) {
auto *UI = dyn_cast<VPRecipeBase>(U);
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index af5e7c9bc385..3738220b4f81 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -112,6 +112,7 @@ private:
bool foldSingleElementStore(Instruction &I);
bool scalarizeLoadExtract(Instruction &I);
bool foldShuffleOfBinops(Instruction &I);
+ bool foldShuffleOfCastops(Instruction &I);
bool foldShuffleFromReductions(Instruction &I);
bool foldTruncFromReductions(Instruction &I);
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
@@ -1432,6 +1433,75 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
return true;
}
+/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
+/// into "castop (shuffle)".
+bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
+ Value *V0, *V1;
+ ArrayRef<int> Mask;
+ if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
+ m_Mask(Mask))))
+ return false;
+
+ auto *C0 = dyn_cast<CastInst>(V0);
+ auto *C1 = dyn_cast<CastInst>(V1);
+ if (!C0 || !C1)
+ return false;
+
+ Instruction::CastOps Opcode = C0->getOpcode();
+ if (Opcode == Instruction::BitCast || C0->getSrcTy() != C1->getSrcTy())
+ return false;
+
+ // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
+ if (Opcode != C1->getOpcode()) {
+ if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
+ Opcode = Instruction::SExt;
+ else
+ return false;
+ }
+
+ auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
+ auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
+ auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
+ if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
+ return false;
+ assert(CastDstTy->getElementCount() == CastSrcTy->getElementCount() &&
+ "Unexpected src/dst element counts");
+
+ auto *NewShuffleDstTy =
+ FixedVectorType::get(CastSrcTy->getScalarType(), Mask.size());
+
+ // Try to replace a castop with a shuffle if the shuffle is not costly.
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ InstructionCost OldCost =
+ TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
+ TTI::CastContextHint::None, CostKind) +
+ TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
+ TTI::CastContextHint::None, CostKind);
+ OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
+ CastDstTy, Mask, CostKind);
+
+ InstructionCost NewCost = TTI.getShuffleCost(
+ TargetTransformInfo::SK_PermuteTwoSrc, CastSrcTy, Mask, CostKind);
+ NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
+ TTI::CastContextHint::None, CostKind);
+ if (NewCost > OldCost)
+ return false;
+
+ Value *Shuf =
+ Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0), Mask);
+ Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
+
+ // Intersect flags from the old casts.
+ if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
+ NewInst->copyIRFlags(C0);
+ NewInst->andIRFlags(C1);
+ }
+
+ replaceValue(I, *Cast);
+ return true;
+}
+
/// Given a commutative reduction, the order of the input lanes does not alter
/// the results. We can use this to remove certain shuffles feeding the
/// reduction, removing the need to shuffle at all.
@@ -1986,6 +2056,7 @@ bool VectorCombine::run() {
break;
case Instruction::ShuffleVector:
MadeChange |= foldShuffleOfBinops(I);
+ MadeChange |= foldShuffleOfCastops(I);
MadeChange |= foldSelectShuffle(I);
break;
case Instruction::BitCast:
diff --git a/llvm/lib/WindowsDriver/MSVCPaths.cpp b/llvm/lib/WindowsDriver/MSVCPaths.cpp
index 634cfcb15f1d..a7bffbb20eba 100644
--- a/llvm/lib/WindowsDriver/MSVCPaths.cpp
+++ b/llvm/lib/WindowsDriver/MSVCPaths.cpp
@@ -268,6 +268,7 @@ const char *archToWindowsSDKArch(Triple::ArchType Arch) {
case Triple::ArchType::x86_64:
return "x64";
case Triple::ArchType::arm:
+ case Triple::ArchType::thumb:
return "arm";
case Triple::ArchType::aarch64:
return "arm64";
@@ -285,6 +286,7 @@ const char *archToLegacyVCArch(Triple::ArchType Arch) {
case Triple::ArchType::x86_64:
return "amd64";
case Triple::ArchType::arm:
+ case Triple::ArchType::thumb:
return "arm";
case Triple::ArchType::aarch64:
return "arm64";
@@ -300,6 +302,7 @@ const char *archToDevDivInternalArch(Triple::ArchType Arch) {
case Triple::ArchType::x86_64:
return "amd64";
case Triple::ArchType::arm:
+ case Triple::ArchType::thumb:
return "arm";
case Triple::ArchType::aarch64:
return "arm64";
@@ -321,6 +324,7 @@ bool appendArchToWindowsSDKLibPath(int SDKMajor, SmallString<128> LibPath,
sys::path::append(LibPath, "x64");
break;
case Triple::arm:
+ case Triple::thumb:
// It is not necessary to link against Windows SDK 7.x when targeting ARM.
return false;
default:
diff --git a/llvm/test/Analysis/CostModel/X86/cast.ll b/llvm/test/Analysis/CostModel/X86/cast.ll
index 64ed9bed13f3..47487d6adf68 100644
--- a/llvm/test/Analysis/CostModel/X86/cast.ll
+++ b/llvm/test/Analysis/CostModel/X86/cast.ll
@@ -35,7 +35,7 @@ define i32 @add(i32 %arg) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %B = sext <4 x i1> undef to <4 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %C = trunc <4 x i32> undef to <4 x i1>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = zext <8 x i1> undef to <8 x i32>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %E = sext <8 x i1> undef to <8 x i32>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %E = sext <8 x i1> undef to <8 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %F = trunc <8 x i32> undef to <8 x i1>
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %G = zext i1 undef to i32
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %H = trunc i32 undef to i1
@@ -143,7 +143,7 @@ define i32 @zext_sext(<8 x i1> %in) {
;
; AVX1-LABEL: 'zext_sext'
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %S = sext <8 x i1> %in to <8 x i32>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %S = sext <8 x i1> %in to <8 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A1 = zext <16 x i8> undef to <16 x i16>
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A2 = sext <16 x i8> undef to <16 x i16>
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A = sext <8 x i16> undef to <8 x i32>
@@ -343,7 +343,7 @@ define i32 @masks8(<8 x i1> %in) {
;
; AVX1-LABEL: 'masks8'
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %S = sext <8 x i1> %in to <8 x i32>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %S = sext <8 x i1> %in to <8 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'masks8'
@@ -374,7 +374,7 @@ define i32 @masks4(<4 x i1> %in) {
;
; AVX1-LABEL: 'masks4'
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <4 x i1> %in to <4 x i64>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %S = sext <4 x i1> %in to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %S = sext <4 x i1> %in to <4 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'masks4'
diff --git a/llvm/test/Analysis/CostModel/X86/extend.ll b/llvm/test/Analysis/CostModel/X86/extend.ll
index 01efced3c13b..4a2585a9ddf9 100644
--- a/llvm/test/Analysis/CostModel/X86/extend.ll
+++ b/llvm/test/Analysis/CostModel/X86/extend.ll
@@ -1962,7 +1962,7 @@ define i32 @sext_vXi1() "min-legal-vector-width"="256" {
; AVX1-LABEL: 'sext_vXi1'
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i1 undef to i64
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i64 = sext <16 x i1> undef to <16 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32i64 = sext <32 x i1> undef to <32 x i64>
@@ -1971,7 +1971,7 @@ define i32 @sext_vXi1() "min-legal-vector-width"="256" {
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sext i1 undef to i32
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32i32 = sext <32 x i1> undef to <32 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64i32 = sext <64 x i1> undef to <64 x i32>
@@ -2242,7 +2242,7 @@ define i32 @sext_vXi1() "min-legal-vector-width"="256" {
; BTVER2-LABEL: 'sext_vXi1'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i1 undef to i64
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i64 = sext <16 x i1> undef to <16 x i64>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32i64 = sext <32 x i1> undef to <32 x i64>
@@ -2251,7 +2251,7 @@ define i32 @sext_vXi1() "min-legal-vector-width"="256" {
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sext i1 undef to i32
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32i32 = sext <32 x i1> undef to <32 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64i32 = sext <64 x i1> undef to <64 x i32>
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
new file mode 100644
index 000000000000..55fdaafc24ed
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
@@ -0,0 +1,2413 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+;
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX
+
+define i32 @masked_load() {
+; SSE2-LABEL: 'masked_load'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_load'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_load'
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_load'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_load'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+ %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+ %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+ %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+ %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+ %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+ %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+ %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+ %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+ %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+ %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+ %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+ %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+ %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+ %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+ %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+ %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+ %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+ %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+ %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+ %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+ %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+ %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_store() {
+; SSE2-LABEL: 'masked_store'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_store'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_store'
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_store'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_store'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_gather() {
+; SSE2-LABEL: 'masked_gather'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_gather'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_gather'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_gather'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_gather'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_gather'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_gather'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_scatter() {
+; SSE2-LABEL: 'masked_scatter'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_scatter'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_scatter'
+; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 194 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_scatter'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_scatter'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_expandload() {
+; SSE2-LABEL: 'masked_expandload'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_expandload'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_expandload'
+; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_expandload'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_compressstore() {
+; SSE2-LABEL: 'masked_compressstore'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_compressstore'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_compressstore'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_compressstore'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_compressstore'
+; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_compressstore'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+ call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+
+ call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+
+ call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+ call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+
+ call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+
+ call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+ call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+
+ call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+ call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+ call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+
+ ret i32 0
+}
+
+; test1: cost of a masked <2 x double> load whose <2 x i1> mask is produced
+; by an integer compare. The prefixed lines above the body record the
+; expected per-instruction costs for each feature level (SSE2, SSE4.2, AVX,
+; AVX-512); the analyzer's output must match them line-for-line.
+; NOTE(review): expectation lines appear script-generated -- regenerate
+; with the update script rather than hand-editing if the IR changes.
+define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) {
+; SSE2-LABEL: 'test1'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test1'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX-LABEL: 'test1'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test1'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+ %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
+ ret <2 x double> %res
+}
+
+; test2: same shape as test1 but for an integer masked load
+; (llvm.masked.load on <4 x i32>). SSE2/SSE4.2 expectations are much
+; larger than AVX/AVX-512, reflecting that those targets have no native
+; masked-load instruction for this type (costs recorded above the body).
+; NOTE(review): expectation lines appear script-generated -- regenerate
+; rather than hand-editing if the IR changes.
+define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) {
+; SSE2-LABEL: 'test2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX-LABEL: 'test2'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX512-LABEL: 'test2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
+ ret <4 x i32> %res
+}
+
+; test3: store-side counterpart of test2 -- cost of llvm.masked.store on
+; <4 x i32> with a compare-generated mask. Expected per-target costs are
+; recorded in the prefixed lines above the IR body.
+; NOTE(review): expectation lines appear script-generated -- regenerate
+; rather than hand-editing if the IR changes.
+define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) {
+; SSE2-LABEL: 'test3'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test3'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test3'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test3'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v4i32.p0(<4 x i32>%val, ptr %addr, i32 4, <4 x i1>%mask)
+ ret void
+}
+
+; test4: 256-bit masked load (<8 x float>). Unlike test1-test3, this one
+; distinguishes AVX1 / AVX2 / SKL as separate expectation sets -- they
+; agree on the masked-load cost (2) but differ on the <8 x i32> icmp cost
+; (5 on AVX1 vs 1 on AVX2/SKL), so a shared AVX prefix cannot be used.
+; NOTE(review): expectation lines appear script-generated -- regenerate
+; rather than hand-editing if the IR changes.
+define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) {
+; SSE2-LABEL: 'test4'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SSE42-LABEL: 'test4'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX1-LABEL: 'test4'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX2-LABEL: 'test4'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SKL-LABEL: 'test4'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX512-LABEL: 'test4'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+ %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+ %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1>%mask, <8 x float>%dst)
+ ret <8 x float> %res
+}
+
+; test5: narrow (sub-128-bit) masked store of <2 x float>. Note the
+; masked store stays non-trivial even on AVX (cost 9) and AVX-512
+; (cost 2) for this narrow type, unlike the full-width cases above.
+; NOTE(review): expectation lines appear script-generated -- regenerate
+; rather than hand-editing if the IR changes.
+define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) {
+; SSE2-LABEL: 'test5'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test5'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test5'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test5'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2f32.p0(<2 x float>%val, ptr %addr, i32 4, <2 x i1>%mask)
+ ret void
+}
+
+; test6: integer twin of test5 -- masked store of <2 x i32>. Costs track
+; test5 closely but the SSE expectations differ slightly (12/10 here vs
+; 10/9 for the float case), which this test pins down per target.
+; NOTE(review): expectation lines appear script-generated -- regenerate
+; rather than hand-editing if the IR changes.
+define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) {
+; SSE2-LABEL: 'test6'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test6'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test6'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test6'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2i32.p0(<2 x i32>%val, ptr %addr, i32 4, <2 x i1>%mask)
+ ret void
+}
+
+; Masked load of <2 x float> with a passthrough (%dst). Expected costs (per CHECK
+; lines): SSE2=10, SSE42=9, AVX=3, AVX512=2.
+define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) {
+; SSE2-LABEL: 'test7'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; SSE42-LABEL: 'test7'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX-LABEL: 'test7'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX512-LABEL: 'test7'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
+ ret <2 x float> %res
+}
+
+; Masked load of <2 x i32> with a passthrough (%dst); integer counterpart of test7.
+; Expected costs (per CHECK lines): SSE2=12, SSE42=10, AVX=3, AVX512=2.
+define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) {
+; SSE2-LABEL: 'test8'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; SSE42-LABEL: 'test8'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX-LABEL: 'test8'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX512-LABEL: 'test8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
+ ret <2 x i32> %res
+}
+
+; Gather of <2 x double> with a variable mask. Expected costs (per CHECK lines):
+; 9 on all targets except SKL=1. NOTE(review): AVX512 here is 9 while SKL is 1 —
+; presumably the AVX512 RUN config lacks the fast-gather tuning SKL has; confirm
+; against the RUN lines at the top of the file.
+define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) {
+; SSE2-LABEL: 'test_gather_2f64'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test_gather_2f64'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX1-LABEL: 'test_gather_2f64'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX2-LABEL: 'test_gather_2f64'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SKL-LABEL: 'test_gather_2f64'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test_gather_2f64'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+ ret <2 x double> %res
+}
+
+; Gather of <4 x i32> with a variable mask. Expected costs (per CHECK lines):
+; SSE2=23, SSE42/AVX1/AVX2/KNL=20, SKL/SKX=1. AVX512 splits into KNL vs SKX
+; prefixes for this test because their costs differ.
+define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
+; Same gather as test_gather_4i32 but with an all-true constant mask; costs drop
+; versus the variable-mask case (SSE2 23->15, SSE42/AVX1/AVX2/KNL 20->12).
+define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
+; Gather of <16 x float> from base+sext(index) addressing with an all-true mask.
+; Expected gather costs (per CHECK lines): SSE2/SSE42=44, AVX1/AVX2=46, SKL/AVX512=1.
+define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float>%res
+}
+
+; Variable-mask variant of test_gather_16f32_const_mask. Pre-AVX512 costs rise
+; versus the const-mask case (44->76, 46->78); SKL/AVX512 stay at 1.
+define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1>%mask) {
+; SSE2-LABEL: 'test_gather_16f32_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float>%res
+}
+
+; Same as test_gather_16f32_var_mask but the GEP base is already a vector of
+; pointers ("ra" = register addresses); expected costs match the var_mask case
+; (76/78 pre-AVX512, 1 on SKL/AVX512).
+define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1>%mask) {
+; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float>%res
+}
+
+; Const-mask gather where the base pointer is splatted into a vector first.
+; Gather costs match test_gather_16f32_const_mask (44/46 pre-AVX512, 1 on
+; SKL/AVX512); the splat shuffle additionally costs 2 on AVX1, 1 elsewhere.
+define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask2'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask2'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask2'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float>%res
+}
+
+define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
+; SSE2-LABEL: 'test_scatter_16i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_16i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX1-LABEL: 'test_scatter_16i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX2-LABEL: 'test_scatter_16i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKL-LABEL: 'test_scatter_16i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SKL-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_16i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+ %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+ %imask = bitcast i16 %mask to <16 x i1>
+ call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>%val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+ ret void
+}
+
+define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) {
+; SSE2-LABEL: 'test_scatter_8i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_8i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_8i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_8i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+ ret void
+}
+
+define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
+; SSE2-LABEL: 'test_scatter_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_4i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; KNL-LABEL: 'test_scatter_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKX-LABEL: 'test_scatter_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+ ret void
+}
+
+define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
+; SSE2-LABEL: 'test_gather_4f32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+ %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+ ret <4 x float>%res
+}
+
+define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_4f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+ %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+ ret <4 x float>%res
+}
+
+declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>)
+declare <7 x double> @llvm.masked.load.v7f64.p0(ptr, i32, <7 x i1>, <7 x double>)
+declare <6 x double> @llvm.masked.load.v6f64.p0(ptr, i32, <6 x i1>, <6 x double>)
+declare <5 x double> @llvm.masked.load.v5f64.p0(ptr, i32, <5 x i1>, <5 x double>)
+declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>)
+declare <3 x double> @llvm.masked.load.v3f64.p0(ptr, i32, <3 x i1>, <3 x double>)
+declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>)
+declare <15 x float> @llvm.masked.load.v15f32.p0(ptr, i32, <15 x i1>, <15 x float>)
+declare <14 x float> @llvm.masked.load.v14f32.p0(ptr, i32, <14 x i1>, <14 x float>)
+declare <13 x float> @llvm.masked.load.v13f32.p0(ptr, i32, <13 x i1>, <13 x float>)
+declare <12 x float> @llvm.masked.load.v12f32.p0(ptr, i32, <12 x i1>, <12 x float>)
+declare <11 x float> @llvm.masked.load.v11f32.p0(ptr, i32, <11 x i1>, <11 x float>)
+declare <10 x float> @llvm.masked.load.v10f32.p0(ptr, i32, <10 x i1>, <10 x float>)
+declare <9 x float> @llvm.masked.load.v9f32.p0(ptr, i32, <9 x i1>, <9 x float>)
+declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>)
+declare <7 x float> @llvm.masked.load.v7f32.p0(ptr, i32, <7 x i1>, <7 x float>)
+declare <6 x float> @llvm.masked.load.v6f32.p0(ptr, i32, <6 x i1>, <6 x float>)
+declare <5 x float> @llvm.masked.load.v5f32.p0(ptr, i32, <5 x i1>, <5 x float>)
+declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
+declare <3 x float> @llvm.masked.load.v3f32.p0(ptr, i32, <3 x i1>, <3 x float>)
+declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>)
+declare <1 x float> @llvm.masked.load.v1f32.p0(ptr, i32, <1 x i1>, <1 x float>)
+
+declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>)
+declare <7 x i64> @llvm.masked.load.v7i64.p0(ptr, i32, <7 x i1>, <7 x i64>)
+declare <6 x i64> @llvm.masked.load.v6i64.p0(ptr, i32, <6 x i1>, <6 x i64>)
+declare <5 x i64> @llvm.masked.load.v5i64.p0(ptr, i32, <5 x i1>, <5 x i64>)
+declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>)
+declare <3 x i64> @llvm.masked.load.v3i64.p0(ptr, i32, <3 x i1>, <3 x i64>)
+declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
+declare <15 x i32> @llvm.masked.load.v15i32.p0(ptr, i32, <15 x i1>, <15 x i32>)
+declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>)
+declare <13 x i32> @llvm.masked.load.v13i32.p0(ptr, i32, <13 x i1>, <13 x i32>)
+declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>)
+declare <11 x i32> @llvm.masked.load.v11i32.p0(ptr, i32, <11 x i1>, <11 x i32>)
+declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>)
+declare <9 x i32> @llvm.masked.load.v9i32.p0(ptr, i32, <9 x i1>, <9 x i32>)
+declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
+declare <7 x i32> @llvm.masked.load.v7i32.p0(ptr, i32, <7 x i1>, <7 x i32>)
+declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>)
+declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr, i32, <5 x i1>, <5 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
+declare <3 x i32> @llvm.masked.load.v3i32.p0(ptr, i32, <3 x i1>, <3 x i32>)
+declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>)
+declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>)
+
+declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f64.p0(<7 x double>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f64.p0(<6 x double>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f64.p0(<5 x double>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f64.p0(<3 x double>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f64.p0(<1 x double>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15f32.p0(<15 x float>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14f32.p0(<14 x float>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13f32.p0(<13 x float>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12f32.p0(<12 x float>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11f32.p0(<11 x float>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10f32.p0(<10 x float>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9f32.p0(<9 x float>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f32.p0(<7 x float>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f32.p0(<6 x float>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f32.p0(<5 x float>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f32.p0(<3 x float>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f32.p0(<2 x float>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f32.p0(<1 x float>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v8i64.p0(<8 x i64>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i64.p0(<7 x i64>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i64.p0(<6 x i64>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i64.p0(<5 x i64>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i64.p0(<3 x i64>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i64.p0(<1 x i64>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16i32.p0(<16 x i32>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15i32.p0(<15 x i32>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14i32.p0(<14 x i32>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13i32.p0(<13 x i32>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12i32.p0(<12 x i32>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11i32.p0(<11 x i32>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10i32.p0(<10 x i32>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9i32.p0(<9 x i32>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i32.p0(<7 x i32>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i32.p0(<6 x i32>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i32.p0(<5 x i32>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i32.p0(<3 x i32>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>)
+
+declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>)
+declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr>, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.scatter.v8f64.v8p0(<8 x double>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1f64.v1p0(<1 x double>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v8i64.v8p0(<8 x i64>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1i64.v1p0(<1 x i64>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v32i16.v32p0(<32 x i16>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i16.v16p0(<16 x i16>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
+
+declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8>, <64 x ptr>, i32, <64 x i1>)
+declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.expandload.v32i16(ptr, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.expandload.v64i8(ptr, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>)
+
+declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>)
+declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
index 897344d622d0..ad56c28510f5 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse4.2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE42
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX2
-;
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skylake -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,SKL
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=knl -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,KNL
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skx -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,SKX
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+;
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX
define i32 @masked_load() {
; SSE2-LABEL: 'masked_load'
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
index 5f22b2e39f94..c7e7c46482c4 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse4.2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE42
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX2
-;
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skylake -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,SKL
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=knl -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,KNL
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skx -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,SKX
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+;
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX
define i32 @masked_load() {
; SSE2-LABEL: 'masked_load'
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
new file mode 100644
index 000000000000..edb05ad71a40
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
@@ -0,0 +1,2413 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+;
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX
+
+define i32 @masked_load() {
+; SSE2-LABEL: 'masked_load'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_load'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_load'
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_load'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_load'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+ %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+ %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+ %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+ %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+ %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+ %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+ %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+ %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+ %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+ %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+ %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+ %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+ %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+ %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+ %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+ %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+ %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+ %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+ %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+ %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+ %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+ %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_store() {
+; SSE2-LABEL: 'masked_store'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_store'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_store'
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_store'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_store'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_gather() {
+; SSE2-LABEL: 'masked_gather'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_gather'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_gather'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_gather'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_gather'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_gather'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_gather'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_scatter() {
+; SSE2-LABEL: 'masked_scatter'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_scatter'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_scatter'
+; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 194 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_scatter'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_scatter'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_expandload() {
+; SSE2-LABEL: 'masked_expandload'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_expandload'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_expandload'
+; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_expandload'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_compressstore() {
+; SSE2-LABEL: 'masked_compressstore'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_compressstore'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_compressstore'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_compressstore'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_compressstore'
+; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_compressstore'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+ call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+
+ call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+
+ call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+ call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+
+ call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+
+ call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+ call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+
+ call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+ call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+ call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+
+ ret i32 0
+}
+
+define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) {
+; SSE2-LABEL: 'test1'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test1'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX-LABEL: 'test1'
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test1'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+ %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
+ ret <2 x double> %res
+}
+
+define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) {
+; SSE2-LABEL: 'test2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX-LABEL: 'test2'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX512-LABEL: 'test2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
+ ret <4 x i32> %res
+}
+
+define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) {
+; SSE2-LABEL: 'test3'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test3'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test3'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test3'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v4i32.p0(<4 x i32>%val, ptr %addr, i32 4, <4 x i1>%mask)
+ ret void
+}
+
+define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) {
+; SSE2-LABEL: 'test4'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SSE42-LABEL: 'test4'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX1-LABEL: 'test4'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX2-LABEL: 'test4'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SKL-LABEL: 'test4'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX512-LABEL: 'test4'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+ %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+ %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1>%mask, <8 x float>%dst)
+ ret <8 x float> %res
+}
+
+define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) {
+; SSE2-LABEL: 'test5'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test5'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test5'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test5'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2f32.p0(<2 x float>%val, ptr %addr, i32 4, <2 x i1>%mask)
+ ret void
+}
+
+define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) {
+; SSE2-LABEL: 'test6'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test6'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test6'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test6'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2i32.p0(<2 x i32>%val, ptr %addr, i32 4, <2 x i1>%mask)
+ ret void
+}
+
+define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) {
+; SSE2-LABEL: 'test7'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; SSE42-LABEL: 'test7'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX-LABEL: 'test7'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX512-LABEL: 'test7'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
+ ret <2 x float> %res
+}
+
+define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) {
+; SSE2-LABEL: 'test8'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; SSE42-LABEL: 'test8'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX-LABEL: 'test8'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX512-LABEL: 'test8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
+ ret <2 x i32> %res
+}
+
+define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) {
+; SSE2-LABEL: 'test_gather_2f64'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test_gather_2f64'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX1-LABEL: 'test_gather_2f64'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX2-LABEL: 'test_gather_2f64'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SKL-LABEL: 'test_gather_2f64'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test_gather_2f64'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+ ret <2 x double> %res
+}
+
+define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
+define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float>%res
+}
+
+define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1>%mask) {
+; SSE2-LABEL: 'test_gather_16f32_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float>%res
+}
+
+define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1>%mask) {
+; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float>%res
+}
+
+; Cost-model check: masked gather of 16 floats through a base pointer splatted
+; across all lanes, with an all-true constant mask. Pre-AVX512 targets without
+; hardware gather pay a scalarized cost (44-46); SKL and AVX512 report cost 1.
+define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask2'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask2'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask2'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask2'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask2'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask2'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+  ; Splat the scalar base across all 16 lanes, then index with sign-extended i32s.
+  %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+  %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+  %sext_ind = sext <16 x i32> %ind to <16 x i64>
+  %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  ret <16 x float>%res
+}
+
+; Cost-model check: masked scatter of 16 i32s with a variable mask (bitcast from
+; an i16). Targets without hardware scatter report scalarized costs (92/80/82);
+; AVX512 reports cost 1.
+define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
+; SSE2-LABEL: 'test_scatter_16i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_16i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX1-LABEL: 'test_scatter_16i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX2-LABEL: 'test_scatter_16i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKL-LABEL: 'test_scatter_16i32'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SKL-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_16i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  ; Splat the base pointer, index directly with the (non-extended) i32 indices,
+  ; and derive the lane mask from the i16 bit pattern.
+  %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+  %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+  %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+  %imask = bitcast i16 %mask to <16 x i1>
+  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>%val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+  ret void
+}
+
+; Cost-model check: masked scatter of 8 i32s through an arbitrary pointer vector
+; with a variable mask. Scalarized cost 46/40/41 on SSE2/SSE42/AVX; cost 1 on
+; AVX512.
+define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) {
+; SSE2-LABEL: 'test_scatter_8i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_8i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_8i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_8i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+  ret void
+}
+
+; Cost-model check: masked scatter of 4 i32s. Note the AVX512 prefix splits here:
+; KNL still reports the scalarized cost (20) for the 128-bit scatter while SKX
+; reports cost 1.
+define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
+; SSE2-LABEL: 'test_scatter_4i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_4i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_4i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; KNL-LABEL: 'test_scatter_4i32'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKX-LABEL: 'test_scatter_4i32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+  ret void
+}
+
+; Cost-model check: masked gather of 4 floats with a variable mask. SKL and SKX
+; report cost 1; SSE2/SSE42/AVX1/AVX2/KNL report the scalarized cost 19.
+define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
+; SSE2-LABEL: 'test_gather_4f32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+  %sext_ind = sext <4 x i32> %ind to <4 x i64>
+  %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+  ret <4 x float>%res
+}
+
+; Cost-model check: same 4-float gather as above but with an all-true constant
+; mask — the scalarized estimate drops from 19 to 11 on targets without hardware
+; gather; SKL and SKX still report cost 1.
+define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_4f32_const_mask'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32_const_mask'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32_const_mask'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32_const_mask'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32_const_mask'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32_const_mask'
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; KNL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32_const_mask'
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+  %sext_ind = sext <4 x i32> %ind to <4 x i64>
+  %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+  ret <4 x float>%res
+}
+
+declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>)
+declare <7 x double> @llvm.masked.load.v7f64.p0(ptr, i32, <7 x i1>, <7 x double>)
+declare <6 x double> @llvm.masked.load.v6f64.p0(ptr, i32, <6 x i1>, <6 x double>)
+declare <5 x double> @llvm.masked.load.v5f64.p0(ptr, i32, <5 x i1>, <5 x double>)
+declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>)
+declare <3 x double> @llvm.masked.load.v3f64.p0(ptr, i32, <3 x i1>, <3 x double>)
+declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>)
+declare <15 x float> @llvm.masked.load.v15f32.p0(ptr, i32, <15 x i1>, <15 x float>)
+declare <14 x float> @llvm.masked.load.v14f32.p0(ptr, i32, <14 x i1>, <14 x float>)
+declare <13 x float> @llvm.masked.load.v13f32.p0(ptr, i32, <13 x i1>, <13 x float>)
+declare <12 x float> @llvm.masked.load.v12f32.p0(ptr, i32, <12 x i1>, <12 x float>)
+declare <11 x float> @llvm.masked.load.v11f32.p0(ptr, i32, <11 x i1>, <11 x float>)
+declare <10 x float> @llvm.masked.load.v10f32.p0(ptr, i32, <10 x i1>, <10 x float>)
+declare <9 x float> @llvm.masked.load.v9f32.p0(ptr, i32, <9 x i1>, <9 x float>)
+declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>)
+declare <7 x float> @llvm.masked.load.v7f32.p0(ptr, i32, <7 x i1>, <7 x float>)
+declare <6 x float> @llvm.masked.load.v6f32.p0(ptr, i32, <6 x i1>, <6 x float>)
+declare <5 x float> @llvm.masked.load.v5f32.p0(ptr, i32, <5 x i1>, <5 x float>)
+declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
+declare <3 x float> @llvm.masked.load.v3f32.p0(ptr, i32, <3 x i1>, <3 x float>)
+declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>)
+declare <1 x float> @llvm.masked.load.v1f32.p0(ptr, i32, <1 x i1>, <1 x float>)
+
+declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>)
+declare <7 x i64> @llvm.masked.load.v7i64.p0(ptr, i32, <7 x i1>, <7 x i64>)
+declare <6 x i64> @llvm.masked.load.v6i64.p0(ptr, i32, <6 x i1>, <6 x i64>)
+declare <5 x i64> @llvm.masked.load.v5i64.p0(ptr, i32, <5 x i1>, <5 x i64>)
+declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>)
+declare <3 x i64> @llvm.masked.load.v3i64.p0(ptr, i32, <3 x i1>, <3 x i64>)
+declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
+declare <15 x i32> @llvm.masked.load.v15i32.p0(ptr, i32, <15 x i1>, <15 x i32>)
+declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>)
+declare <13 x i32> @llvm.masked.load.v13i32.p0(ptr, i32, <13 x i1>, <13 x i32>)
+declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>)
+declare <11 x i32> @llvm.masked.load.v11i32.p0(ptr, i32, <11 x i1>, <11 x i32>)
+declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>)
+declare <9 x i32> @llvm.masked.load.v9i32.p0(ptr, i32, <9 x i1>, <9 x i32>)
+declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
+declare <7 x i32> @llvm.masked.load.v7i32.p0(ptr, i32, <7 x i1>, <7 x i32>)
+declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>)
+declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr, i32, <5 x i1>, <5 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
+declare <3 x i32> @llvm.masked.load.v3i32.p0(ptr, i32, <3 x i1>, <3 x i32>)
+declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>)
+declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>)
+
+declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f64.p0(<7 x double>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f64.p0(<6 x double>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f64.p0(<5 x double>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f64.p0(<3 x double>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f64.p0(<1 x double>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15f32.p0(<15 x float>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14f32.p0(<14 x float>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13f32.p0(<13 x float>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12f32.p0(<12 x float>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11f32.p0(<11 x float>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10f32.p0(<10 x float>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9f32.p0(<9 x float>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f32.p0(<7 x float>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f32.p0(<6 x float>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f32.p0(<5 x float>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f32.p0(<3 x float>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f32.p0(<2 x float>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f32.p0(<1 x float>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v8i64.p0(<8 x i64>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i64.p0(<7 x i64>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i64.p0(<6 x i64>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i64.p0(<5 x i64>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i64.p0(<3 x i64>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i64.p0(<1 x i64>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16i32.p0(<16 x i32>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15i32.p0(<15 x i32>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14i32.p0(<14 x i32>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13i32.p0(<13 x i32>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12i32.p0(<12 x i32>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11i32.p0(<11 x i32>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10i32.p0(<10 x i32>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9i32.p0(<9 x i32>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i32.p0(<7 x i32>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i32.p0(<6 x i32>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i32.p0(<5 x i32>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i32.p0(<3 x i32>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>)
+
+declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>)
+declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr>, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.scatter.v8f64.v8p0(<8 x double>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1f64.v1p0(<1 x double>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v8i64.v8p0(<8 x i64>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1i64.v1p0(<1 x i64>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v32i16.v32p0(<32 x i16>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i16.v16p0(<16 x i16>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
+
+declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8>, <64 x ptr>, i32, <64 x i1>)
+declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.expandload.v32i16(ptr, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.expandload.v64i8(ptr, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>)
+
+declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>)
+declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
new file mode 100644
index 000000000000..3ebd9cc41c09
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
@@ -0,0 +1,2413 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+;
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX
+
+define i32 @masked_load() {
+; SSE2-LABEL: 'masked_load'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_load'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_load'
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_load'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_load'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+ %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+ %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+ %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+ %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+ %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+ %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+ %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+ %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+ %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+ %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+ %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+ %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+ %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+ %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+ %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+ %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+ %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+ %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+ %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+ %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+ %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+ %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_store() {
+; SSE2-LABEL: 'masked_store'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_store'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_store'
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_store'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_store'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_gather() {
+; SSE2-LABEL: 'masked_gather'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_gather'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_gather'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_gather'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_gather'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_gather'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_gather'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_scatter() {
+; SSE2-LABEL: 'masked_scatter'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_scatter'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_scatter'
+; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 194 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_scatter'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_scatter'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_expandload() {
+; SSE2-LABEL: 'masked_expandload'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_expandload'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_expandload'
+; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_expandload'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
+; Cost-model expectations for @llvm.masked.compressstore at a range of vector
+; widths and element types (f64/f32/i64/i32/i16/i8), one CHECK section per
+; target configuration visible below: SSE2, SSE42, AVX1, AVX2, SKL, AVX512.
+; NOTE(review): the CHECK-NEXT lines are auto-generated expectations — if the
+; x86 cost model changes, regenerate them with the update script rather than
+; editing by hand.
+define i32 @masked_compressstore() {
+; SSE2-LABEL: 'masked_compressstore'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_compressstore'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_compressstore'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_compressstore'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_compressstore'
+; SKL-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_compressstore'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+  call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+  call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+  call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+  call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+
+  call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+  call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+  call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+  call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+
+  call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+  call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+  call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+  call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+
+  call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+  call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+  call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+  call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+
+  call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+  call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+  call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+  call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+
+  call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+  call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+  call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+  call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+
+  ret i32 0
+}
+
+; Masked f64 load whose <2 x i1> mask is produced by an icmp against zero;
+; checks the cost of the compare plus @llvm.masked.load.v2f64 (align 4) at
+; each target level. CHECK lines are auto-generated — regenerate, don't edit.
+define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) {
+; SSE2-LABEL: 'test1'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test1'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX-LABEL: 'test1'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test1'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+  %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
+  ret <2 x double> %res
+}
+
+; Same pattern as @test1 but for an integer payload: icmp-derived <4 x i1>
+; mask feeding @llvm.masked.load.v4i32 (align 4). CHECK lines are
+; auto-generated expectations — regenerate rather than hand-edit.
+define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) {
+; SSE2-LABEL: 'test2'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test2'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX-LABEL: 'test2'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX512-LABEL: 'test2'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+  %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
+  ret <4 x i32> %res
+}
+
+define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) {
+; SSE2-LABEL: 'test3'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test3'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test3'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test3'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v4i32.p0(<4 x i32>%val, ptr %addr, i32 4, <4 x i1>%mask)
+ ret void
+}
+
+define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) {
+; SSE2-LABEL: 'test4'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SSE42-LABEL: 'test4'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX1-LABEL: 'test4'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX2-LABEL: 'test4'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SKL-LABEL: 'test4'
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX512-LABEL: 'test4'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+ %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+ %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1>%mask, <8 x float>%dst)
+ ret <8 x float> %res
+}
+
+define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) {
+; SSE2-LABEL: 'test5'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test5'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test5'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test5'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2f32.p0(<2 x float>%val, ptr %addr, i32 4, <2 x i1>%mask)
+ ret void
+}
+
+define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) {
+; SSE2-LABEL: 'test6'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test6'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test6'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test6'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2i32.p0(<2 x i32>%val, ptr %addr, i32 4, <2 x i1>%mask)
+ ret void
+}
+
+define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) {
+; SSE2-LABEL: 'test7'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; SSE42-LABEL: 'test7'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX-LABEL: 'test7'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX512-LABEL: 'test7'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
+ ret <2 x float> %res
+}
+
+define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) {
+; SSE2-LABEL: 'test8'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; SSE42-LABEL: 'test8'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX-LABEL: 'test8'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX512-LABEL: 'test8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
+ ret <2 x i32> %res
+}
+
+define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) {
+; SSE2-LABEL: 'test_gather_2f64'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test_gather_2f64'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX1-LABEL: 'test_gather_2f64'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX2-LABEL: 'test_gather_2f64'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SKL-LABEL: 'test_gather_2f64'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test_gather_2f64'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+ ret <2 x double> %res
+}
+
+define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
+define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float>%res
+}
+
+define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1>%mask) {
+; SSE2-LABEL: 'test_gather_16f32_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float>%res
+}
+
+define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1>%mask) {
+; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float>%res
+}
+
+; Gather of 16 floats through a splatted base pointer with an all-true constant
+; mask: legal-mask gathers should be cheap (cost 1) on SKL/AVX512, scalarized
+; (high cost) on pre-gather targets. CHECK lines are autogenerated — do not
+; hand-edit; regenerate with the update script instead.
+define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask2'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask2'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask2'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+  %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+  %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+  %sext_ind = sext <16 x i32> %ind to <16 x i64>
+  %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  ret <16 x float>%res
+}
+
+; Scatter of 16 x i32 through a splatted base with a variable mask rebuilt
+; from an i16 bitcast: only AVX512 (native vpscatterdd) gets cost 1; everyone
+; else pays the scalarization cost. CHECK lines are autogenerated — do not
+; hand-edit; regenerate with the update script instead.
+define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
+; SSE2-LABEL: 'test_scatter_16i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_16i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX1-LABEL: 'test_scatter_16i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX2-LABEL: 'test_scatter_16i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKL-LABEL: 'test_scatter_16i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SKL-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_16i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+  %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+  %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+  %imask = bitcast i16 %mask to <16 x i1>
+  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>%val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+  ret void
+}
+
+; 8 x i32 scatter with a variable vector mask: cost 1 only on AVX512
+; (8-wide scatter is legal there); scalarized elsewhere. CHECK lines are
+; autogenerated — regenerate rather than hand-edit.
+define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) {
+; SSE2-LABEL: 'test_scatter_8i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_8i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_8i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_8i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+  ret void
+}
+
+; 4 x i32 scatter: note KNL and SKX diverge (KNL lacks 128-bit scatter, so it
+; scalarizes at cost 20, while SKX/AVX512VL gets cost 1). CHECK lines are
+; autogenerated — regenerate rather than hand-edit.
+define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
+; SSE2-LABEL: 'test_scatter_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_4i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; KNL-LABEL: 'test_scatter_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKX-LABEL: 'test_scatter_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+  ret void
+}
+
+; 4 x float gather with a variable mask from a scalar base: SKL and SKX model
+; it as a single gather (cost 1); KNL (no 128-bit gather) and pre-AVX2 targets
+; scalarize at cost 19. CHECK lines are autogenerated — regenerate rather
+; than hand-edit.
+define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) {
+; SSE2-LABEL: 'test_gather_4f32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+  %sext_ind = sext <4 x i32> %ind to <4 x i64>
+  %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+  ret <4 x float>%res
+}
+
+; Same 4 x float gather but with an all-true constant mask: scalarized cost
+; drops (11 vs 19) since no per-lane mask tests are needed; SKL/SKX still
+; model it at cost 1. CHECK lines are autogenerated — regenerate rather
+; than hand-edit.
+define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_4f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+  %sext_ind = sext <4 x i32> %ind to <4 x i64>
+  %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+  ret <4 x float>%res
+}
+
+; Declarations of the masked load/store intrinsics exercised by the tests
+; above, covering power-of-two and odd element counts per element type.
+declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>)
+declare <7 x double> @llvm.masked.load.v7f64.p0(ptr, i32, <7 x i1>, <7 x double>)
+declare <6 x double> @llvm.masked.load.v6f64.p0(ptr, i32, <6 x i1>, <6 x double>)
+declare <5 x double> @llvm.masked.load.v5f64.p0(ptr, i32, <5 x i1>, <5 x double>)
+declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>)
+declare <3 x double> @llvm.masked.load.v3f64.p0(ptr, i32, <3 x i1>, <3 x double>)
+declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>)
+declare <15 x float> @llvm.masked.load.v15f32.p0(ptr, i32, <15 x i1>, <15 x float>)
+declare <14 x float> @llvm.masked.load.v14f32.p0(ptr, i32, <14 x i1>, <14 x float>)
+declare <13 x float> @llvm.masked.load.v13f32.p0(ptr, i32, <13 x i1>, <13 x float>)
+declare <12 x float> @llvm.masked.load.v12f32.p0(ptr, i32, <12 x i1>, <12 x float>)
+declare <11 x float> @llvm.masked.load.v11f32.p0(ptr, i32, <11 x i1>, <11 x float>)
+declare <10 x float> @llvm.masked.load.v10f32.p0(ptr, i32, <10 x i1>, <10 x float>)
+declare <9 x float> @llvm.masked.load.v9f32.p0(ptr, i32, <9 x i1>, <9 x float>)
+declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>)
+declare <7 x float> @llvm.masked.load.v7f32.p0(ptr, i32, <7 x i1>, <7 x float>)
+declare <6 x float> @llvm.masked.load.v6f32.p0(ptr, i32, <6 x i1>, <6 x float>)
+declare <5 x float> @llvm.masked.load.v5f32.p0(ptr, i32, <5 x i1>, <5 x float>)
+declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
+declare <3 x float> @llvm.masked.load.v3f32.p0(ptr, i32, <3 x i1>, <3 x float>)
+declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>)
+declare <1 x float> @llvm.masked.load.v1f32.p0(ptr, i32, <1 x i1>, <1 x float>)
+
+declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>)
+declare <7 x i64> @llvm.masked.load.v7i64.p0(ptr, i32, <7 x i1>, <7 x i64>)
+declare <6 x i64> @llvm.masked.load.v6i64.p0(ptr, i32, <6 x i1>, <6 x i64>)
+declare <5 x i64> @llvm.masked.load.v5i64.p0(ptr, i32, <5 x i1>, <5 x i64>)
+declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>)
+declare <3 x i64> @llvm.masked.load.v3i64.p0(ptr, i32, <3 x i1>, <3 x i64>)
+declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
+declare <15 x i32> @llvm.masked.load.v15i32.p0(ptr, i32, <15 x i1>, <15 x i32>)
+declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>)
+declare <13 x i32> @llvm.masked.load.v13i32.p0(ptr, i32, <13 x i1>, <13 x i32>)
+declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>)
+declare <11 x i32> @llvm.masked.load.v11i32.p0(ptr, i32, <11 x i1>, <11 x i32>)
+declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>)
+declare <9 x i32> @llvm.masked.load.v9i32.p0(ptr, i32, <9 x i1>, <9 x i32>)
+declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
+declare <7 x i32> @llvm.masked.load.v7i32.p0(ptr, i32, <7 x i1>, <7 x i32>)
+declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>)
+declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr, i32, <5 x i1>, <5 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
+declare <3 x i32> @llvm.masked.load.v3i32.p0(ptr, i32, <3 x i1>, <3 x i32>)
+declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>)
+declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>)
+
+declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f64.p0(<7 x double>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f64.p0(<6 x double>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f64.p0(<5 x double>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f64.p0(<3 x double>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f64.p0(<1 x double>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15f32.p0(<15 x float>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14f32.p0(<14 x float>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13f32.p0(<13 x float>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12f32.p0(<12 x float>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11f32.p0(<11 x float>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10f32.p0(<10 x float>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9f32.p0(<9 x float>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f32.p0(<7 x float>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f32.p0(<6 x float>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f32.p0(<5 x float>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f32.p0(<3 x float>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f32.p0(<2 x float>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f32.p0(<1 x float>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v8i64.p0(<8 x i64>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i64.p0(<7 x i64>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i64.p0(<6 x i64>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i64.p0(<5 x i64>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i64.p0(<3 x i64>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i64.p0(<1 x i64>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16i32.p0(<16 x i32>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15i32.p0(<15 x i32>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14i32.p0(<14 x i32>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13i32.p0(<13 x i32>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12i32.p0(<12 x i32>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11i32.p0(<11 x i32>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10i32.p0(<10 x i32>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9i32.p0(<9 x i32>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i32.p0(<7 x i32>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i32.p0(<6 x i32>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i32.p0(<5 x i32>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i32.p0(<3 x i32>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>)
+
+declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>)
+declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr>, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.scatter.v8f64.v8p0(<8 x double>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1f64.v1p0(<1 x double>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v8i64.v8p0(<8 x i64>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1i64.v1p0(<1 x i64>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v32i16.v32p0(<32 x i16>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i16.v16p0(<16 x i16>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
+
+declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8>, <64 x ptr>, i32, <64 x i1>)
+declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.expandload.v32i16(ptr, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.expandload.v64i8(ptr, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>)
+
+declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>)
+declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir
deleted file mode 100644
index b145a6d3fd39..000000000000
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir
+++ /dev/null
@@ -1,28 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner %s -o - \
-# RUN: --aarch64prelegalizercombiner-disable-rule=constant_fold_binop | FileCheck %s
-
-# `constant_fold_binop` is disabled to trigger the infinite loop in `commute_int_constant_to_rhs`.
-
----
-name: add
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $s0
-
- ; CHECK-LABEL: name: add
- ; CHECK: liveins: $s0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: %c0:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: %c1:_(s32) = G_CONSTANT i32 2
- ; CHECK-NEXT: %add:_(s32) = G_ADD %c0, %c1
- ; CHECK-NEXT: $s0 = COPY %add(s32)
- ; CHECK-NEXT: RET_ReallyLR
- %c0:_(s32) = G_CONSTANT i32 1
- %c1:_(s32) = G_CONSTANT i32 2
- %add:_(s32) = G_ADD %c0, %c1
- $s0 = COPY %add
- RET_ReallyLR
-
-...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
index 499c08fa4966..7921de6ce236 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
@@ -15,7 +15,7 @@
define void @mul_wrong_pow_2(ptr %addr) { ret void }
define void @more_than_one_use_shl_1(ptr %addr) { ret void }
define void @more_than_one_use_shl_2(ptr %addr) { ret void }
- define void @more_than_one_use_shl_lsl_fast(ptr %addr) #1 { ret void }
+ define void @more_than_one_use_shl_lsl_fast(ptr %addr) { ret void }
define void @more_than_one_use_shl_lsl_slow(ptr %addr) { ret void }
define void @more_than_one_use_shl_minsize(ptr %addr) #0 { ret void }
define void @ldrwrox(ptr %addr) { ret void }
@@ -24,7 +24,6 @@
define void @ldbbrox(ptr %addr) { ret void }
define void @ldrqrox(ptr %addr) { ret void }
attributes #0 = { optsize }
- attributes #1 = { "target-features"="+addr-lsl-fast" }
...
---
@@ -478,11 +477,10 @@ body: |
; CHECK: liveins: $x0, $x1, $x2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
- ; CHECK-NEXT: [[ADDXrs:%[0-9]+]]:gpr64common = ADDXrs [[COPY1]], [[COPY]], 3
- ; CHECK-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrs]], 0 :: (load (s64) from %ir.addr)
- ; CHECK-NEXT: [[LDRXui1:%[0-9]+]]:gpr64 = LDRXui [[ADDXrs]], 0 :: (load (s64) from %ir.addr)
- ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[LDRXui1]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+ ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr)
+ ; CHECK-NEXT: [[LDRXroX1:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr)
+ ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[LDRXroX1]]
; CHECK-NEXT: $x2 = COPY [[ADDXrr]]
; CHECK-NEXT: RET_ReallyLR implicit $x2
%0:gpr(s64) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
index 59cd87f58ab0..022aaea9ef0c 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK0
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK3
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-slow-14 | FileCheck %s --check-prefixes=CHECK,CHECK0
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK3
%struct.a = type [256 x i16]
%struct.b = type [256 x i32]
@@ -49,36 +49,20 @@ define i16 @halfword(ptr %ctx, i32 %xor72) nounwind {
}
define i32 @word(ptr %ctx, i32 %xor72) nounwind {
-; CHECK0-LABEL: word:
-; CHECK0: // %bb.0:
-; CHECK0-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK0-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK0-NEXT: ubfx x8, x1, #9, #8
-; CHECK0-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT: mov x19, x0
-; CHECK0-NEXT: lsl x21, x8, #2
-; CHECK0-NEXT: ldr w20, [x0, x21]
-; CHECK0-NEXT: bl foo
-; CHECK0-NEXT: mov w0, w20
-; CHECK0-NEXT: str w20, [x19, x21]
-; CHECK0-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK0-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK0-NEXT: ret
-;
-; CHECK3-LABEL: word:
-; CHECK3: // %bb.0:
-; CHECK3-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK3-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK3-NEXT: ubfx x21, x1, #9, #8
-; CHECK3-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK3-NEXT: mov x19, x0
-; CHECK3-NEXT: ldr w20, [x0, x21, lsl #2]
-; CHECK3-NEXT: bl foo
-; CHECK3-NEXT: mov w0, w20
-; CHECK3-NEXT: str w20, [x19, x21, lsl #2]
-; CHECK3-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK3-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK3-NEXT: ret
+; CHECK-LABEL: word:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: ubfx x21, x1, #9, #8
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: ldr w20, [x0, x21, lsl #2]
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: mov w0, w20
+; CHECK-NEXT: str w20, [x19, x21, lsl #2]
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
%idxprom83 = and i64 %conv82, 255
@@ -90,36 +74,20 @@ define i32 @word(ptr %ctx, i32 %xor72) nounwind {
}
define i64 @doubleword(ptr %ctx, i32 %xor72) nounwind {
-; CHECK0-LABEL: doubleword:
-; CHECK0: // %bb.0:
-; CHECK0-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK0-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK0-NEXT: ubfx x8, x1, #9, #8
-; CHECK0-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT: mov x19, x0
-; CHECK0-NEXT: lsl x21, x8, #3
-; CHECK0-NEXT: ldr x20, [x0, x21]
-; CHECK0-NEXT: bl foo
-; CHECK0-NEXT: mov x0, x20
-; CHECK0-NEXT: str x20, [x19, x21]
-; CHECK0-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK0-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK0-NEXT: ret
-;
-; CHECK3-LABEL: doubleword:
-; CHECK3: // %bb.0:
-; CHECK3-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK3-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK3-NEXT: ubfx x21, x1, #9, #8
-; CHECK3-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK3-NEXT: mov x19, x0
-; CHECK3-NEXT: ldr x20, [x0, x21, lsl #3]
-; CHECK3-NEXT: bl foo
-; CHECK3-NEXT: mov x0, x20
-; CHECK3-NEXT: str x20, [x19, x21, lsl #3]
-; CHECK3-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK3-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK3-NEXT: ret
+; CHECK-LABEL: doubleword:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: ubfx x21, x1, #9, #8
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: ldr x20, [x0, x21, lsl #3]
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: str x20, [x19, x21, lsl #3]
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
%idxprom83 = and i64 %conv82, 255
@@ -163,20 +131,12 @@ endbb:
}
define i64 @gep3(ptr %p, i64 %b) {
-; CHECK0-LABEL: gep3:
-; CHECK0: // %bb.0:
-; CHECK0-NEXT: lsl x9, x1, #3
-; CHECK0-NEXT: mov x8, x0
-; CHECK0-NEXT: ldr x0, [x0, x9]
-; CHECK0-NEXT: str x1, [x8, x9]
-; CHECK0-NEXT: ret
-;
-; CHECK3-LABEL: gep3:
-; CHECK3: // %bb.0:
-; CHECK3-NEXT: mov x8, x0
-; CHECK3-NEXT: ldr x0, [x0, x1, lsl #3]
-; CHECK3-NEXT: str x1, [x8, x1, lsl #3]
-; CHECK3-NEXT: ret
+; CHECK-LABEL: gep3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: ldr x0, [x0, x1, lsl #3]
+; CHECK-NEXT: str x1, [x8, x1, lsl #3]
+; CHECK-NEXT: ret
%g = getelementptr inbounds i64, ptr %p, i64 %b
%l = load i64, ptr %g
store i64 %b, ptr %g
diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
index 573f921e638c..e31c9a072dc4 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
@@ -134,9 +134,8 @@ define void @test8(i64 %a, ptr noalias %src, ptr noalias %dst, i64 %n) {
; CHECK-NEXT: b.hs .LBB7_1
; CHECK-NEXT: // %bb.3: // %if.then
; CHECK-NEXT: // in Loop: Header=BB7_2 Depth=1
-; CHECK-NEXT: lsl x10, x8, #3
-; CHECK-NEXT: ldr x11, [x1, x10]
-; CHECK-NEXT: str x11, [x2, x10]
+; CHECK-NEXT: ldr x10, [x1, x8, lsl #3]
+; CHECK-NEXT: str x10, [x2, x8, lsl #3]
; CHECK-NEXT: b .LBB7_1
; CHECK-NEXT: .LBB7_4: // %exit
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll b/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
index d593272be1aa..6bcd2f04849b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
@@ -125,7 +125,7 @@ return: ; preds = %if.end23, %if.then3
}
; CHECK: @test
-; CHECK-NOT: , uxtw #2]
+; CHECK: , uxtw #2]
define i32 @test(ptr %array, i8 zeroext %c, i32 %arg) {
entry:
%conv = zext i8 %c to i32
diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
index 3542b26b5353..5b055a4eb37a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
@@ -201,11 +201,10 @@ define void @fct1_64x1(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_64x1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray64x1
-; CHECK-NEXT: lsl x9, x1, #3
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray64x1]
-; CHECK-NEXT: ldr d0, [x0, x9]
+; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: str d0, [x8, x9]
+; CHECK-NEXT: str d0, [x8, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <1 x i64>, ptr %array, i64 %offset
@@ -238,11 +237,10 @@ define void @fct1_32x2(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_32x2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray32x2
-; CHECK-NEXT: lsl x9, x1, #3
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray32x2]
-; CHECK-NEXT: ldr d0, [x0, x9]
+; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: str d0, [x8, x9]
+; CHECK-NEXT: str d0, [x8, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <2 x i32>, ptr %array, i64 %offset
@@ -275,11 +273,10 @@ define void @fct1_16x4(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_16x4:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray16x4
-; CHECK-NEXT: lsl x9, x1, #3
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray16x4]
-; CHECK-NEXT: ldr d0, [x0, x9]
+; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: str d0, [x8, x9]
+; CHECK-NEXT: str d0, [x8, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <4 x i16>, ptr %array, i64 %offset
@@ -312,11 +309,10 @@ define void @fct1_8x8(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_8x8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray8x8
-; CHECK-NEXT: lsl x9, x1, #3
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray8x8]
-; CHECK-NEXT: ldr d0, [x0, x9]
+; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: str d0, [x8, x9]
+; CHECK-NEXT: str d0, [x8, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <8 x i8>, ptr %array, i64 %offset
diff --git a/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll b/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll
index 8f195531927e..634d1b90ff90 100644
--- a/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll
+++ b/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll
@@ -82,13 +82,12 @@ define void @avoid_promotion_2_and(ptr nocapture noundef %arg) {
; CHECK-NEXT: eor w10, w10, w11
; CHECK-NEXT: ldur w11, [x8, #-24]
; CHECK-NEXT: and w10, w10, w14
-; CHECK-NEXT: ldp x15, x14, [x8, #-16]
-; CHECK-NEXT: ubfiz x13, x10, #1, #32
+; CHECK-NEXT: ldp x14, x13, [x8, #-16]
; CHECK-NEXT: str w10, [x8]
-; CHECK-NEXT: and w10, w11, w12
-; CHECK-NEXT: ldrh w11, [x14, x13]
-; CHECK-NEXT: strh w11, [x15, w10, uxtw #1]
-; CHECK-NEXT: strh w12, [x14, x13]
+; CHECK-NEXT: and w11, w11, w12
+; CHECK-NEXT: ldrh w15, [x13, w10, uxtw #1]
+; CHECK-NEXT: strh w15, [x14, w11, uxtw #1]
+; CHECK-NEXT: strh w12, [x13, w10, uxtw #1]
; CHECK-NEXT: b LBB1_1
; CHECK-NEXT: LBB1_4: ; %exit
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll b/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
index b5c21044266c..50c70c5676c4 100644
--- a/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
+++ b/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
@@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux"
define void @f0(ptr %a, i64 %n) {
; CHECK-LABEL: f0:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 48
@@ -15,7 +15,6 @@ define void @f0(ptr %a, i64 %n) {
; CHECK-NEXT: .cfi_offset w20, -16
; CHECK-NEXT: .cfi_offset w21, -24
; CHECK-NEXT: .cfi_offset w22, -32
-; CHECK-NEXT: .cfi_offset w23, -40
; CHECK-NEXT: .cfi_offset w30, -48
; CHECK-NEXT: mov x21, #1 // =0x1
; CHECK-NEXT: mov x19, x1
@@ -27,18 +26,17 @@ define void @f0(ptr %a, i64 %n) {
; CHECK-NEXT: b.ge .LBB0_2
; CHECK-NEXT: .LBB0_1: // %loop.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lsl x23, x22, #2
+; CHECK-NEXT: ldr w0, [x20, x22, lsl #2]
; CHECK-NEXT: mov x1, x21
-; CHECK-NEXT: ldr w0, [x20, x23]
; CHECK-NEXT: bl g
-; CHECK-NEXT: str w0, [x20, x23]
+; CHECK-NEXT: str w0, [x20, x22, lsl #2]
; CHECK-NEXT: add x22, x22, #1
; CHECK-NEXT: cmp x22, x19
; CHECK-NEXT: b.lt .LBB0_1
; CHECK-NEXT: .LBB0_2: // %exit
; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
br label %loop
@@ -64,15 +62,13 @@ exit:
define void @f1(ptr %a, i64 %n) {
; CHECK-LABEL: f1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: .cfi_offset w19, -8
; CHECK-NEXT: .cfi_offset w20, -16
; CHECK-NEXT: .cfi_offset w21, -24
-; CHECK-NEXT: .cfi_offset w22, -32
-; CHECK-NEXT: .cfi_offset w30, -48
+; CHECK-NEXT: .cfi_offset w30, -32
; CHECK-NEXT: mov x19, x1
; CHECK-NEXT: mov x20, x0
; CHECK-NEXT: mov x21, xzr
@@ -80,19 +76,17 @@ define void @f1(ptr %a, i64 %n) {
; CHECK-NEXT: b.ge .LBB1_2
; CHECK-NEXT: .LBB1_1: // %loop.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lsl x22, x21, #2
+; CHECK-NEXT: ldr w0, [x20, x21, lsl #2]
; CHECK-NEXT: mov x1, #1450704896 // =0x56780000
; CHECK-NEXT: movk x1, #4660, lsl #48
-; CHECK-NEXT: ldr w0, [x20, x22]
; CHECK-NEXT: bl g
-; CHECK-NEXT: str w0, [x20, x22]
+; CHECK-NEXT: str w0, [x20, x21, lsl #2]
; CHECK-NEXT: add x21, x21, #1
; CHECK-NEXT: cmp x21, x19
; CHECK-NEXT: b.lt .LBB1_1
; CHECK-NEXT: .LBB1_2: // %exit
-; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
entry:
br label %loop
diff --git a/llvm/test/CodeGen/AArch64/extract-bits.ll b/llvm/test/CodeGen/AArch64/extract-bits.ll
index d4ea143a3d84..b87157a18383 100644
--- a/llvm/test/CodeGen/AArch64/extract-bits.ll
+++ b/llvm/test/CodeGen/AArch64/extract-bits.ll
@@ -972,10 +972,9 @@ define void @pr38938(ptr %a0, ptr %a1) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr x8, [x1]
; CHECK-NEXT: ubfx x8, x8, #21, #10
-; CHECK-NEXT: lsl x8, x8, #2
-; CHECK-NEXT: ldr w9, [x0, x8]
+; CHECK-NEXT: ldr w9, [x0, x8, lsl #2]
; CHECK-NEXT: add w9, w9, #1
-; CHECK-NEXT: str w9, [x0, x8]
+; CHECK-NEXT: str w9, [x0, x8, lsl #2]
; CHECK-NEXT: ret
%tmp = load i64, ptr %a1, align 8
%tmp1 = lshr i64 %tmp, 21
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
index 30123a31cebb..e8dafd5e8fba 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
@@ -223,10 +223,9 @@ define i64 @three_dimensional_middle(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
; CHECK-NEXT: // Parent Loop BB3_1 Depth=1
; CHECK-NEXT: // => This Loop Header: Depth=2
; CHECK-NEXT: // Child Loop BB3_3 Depth 3
-; CHECK-NEXT: lsl x12, x11, #3
+; CHECK-NEXT: ldr x13, [x1, x11, lsl #3]
+; CHECK-NEXT: ldr x12, [x10, x11, lsl #3]
; CHECK-NEXT: mov x14, x4
-; CHECK-NEXT: ldr x13, [x1, x12]
-; CHECK-NEXT: ldr x12, [x10, x12]
; CHECK-NEXT: ldr w13, [x13]
; CHECK-NEXT: .LBB3_3: // %for.body8
; CHECK-NEXT: // Parent Loop BB3_1 Depth=1
diff --git a/llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll b/llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll
new file mode 100644
index 000000000000..728cffeba02a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll
@@ -0,0 +1,50 @@
+; RUN: rm -rf %t && split-file %s %t && cd %t
+
+;--- ok.ll
+
+; RUN: llc -mtriple=aarch64-linux ok.ll -o - | \
+; RUN: FileCheck %s --check-prefix=ASM
+; RUN: llc -mtriple=aarch64-linux ok.ll -filetype=obj -o - | \
+; RUN: llvm-readelf --notes - | FileCheck %s --check-prefix=OBJ
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458}
+!1 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 85}
+
+; ASM: .section .note.gnu.property,"a",@note
+; ASM-NEXT: .p2align 3, 0x0
+; ASM-NEXT: .word 4
+; ASM-NEXT: .word 24
+; ASM-NEXT: .word 5
+; ASM-NEXT: .asciz "GNU"
+; 3221225473 = 0xc0000001 = GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+; ASM-NEXT: .word 3221225473
+; ASM-NEXT: .word 16
+; ASM-NEXT: .xword 268435458
+; ASM-NEXT: .xword 85
+
+; OBJ: Displaying notes found in: .note.gnu.property
+; OBJ-NEXT: Owner Data size Description
+; OBJ-NEXT: GNU 0x00000018 NT_GNU_PROPERTY_TYPE_0 (property note)
+; OBJ-NEXT: AArch64 PAuth ABI core info: platform 0x10000002 (llvm_linux), version 0x55 (PointerAuthIntrinsics, !PointerAuthCalls, PointerAuthReturns, !PointerAuthAuthTraps, PointerAuthVTPtrAddressDiscrimination, !PointerAuthVTPtrTypeDiscrimination, PointerAuthInitFini)
+
+; ERR: either both or no 'aarch64-elf-pauthabi-platform' and 'aarch64-elf-pauthabi-version' module flags must be present
+
+;--- err1.ll
+
+; RUN: not llc -mtriple=aarch64-linux err1.ll 2>&1 -o - | \
+; RUN: FileCheck %s --check-prefix=ERR
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 2}
+
+;--- err2.ll
+
+; RUN: not llc -mtriple=aarch64-linux err2.ll 2>&1 -o - | \
+; RUN: FileCheck %s --check-prefix=ERR
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 31}
diff --git a/llvm/test/CodeGen/AArch64/sink-and-fold.ll b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
index 52007221e12a..f65a08ae2ace 100644
--- a/llvm/test/CodeGen/AArch64/sink-and-fold.ll
+++ b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
@@ -100,7 +100,7 @@ exit:
}
; Address calculation cheap enough on some cores.
-define i32 @f3(i1 %c1, ptr %p, i64 %i) nounwind "target-features"="+alu-lsl-fast,+addr-lsl-fast" {
+define i32 @f3(i1 %c1, ptr %p, i64 %i) nounwind "target-features"="+alu-lsl-fast" {
; CHECK-LABEL: f3:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: tbz w0, #0, .LBB3_2
@@ -130,7 +130,7 @@ exit:
ret i32 %v
}
-define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast,+addr-lsl-fast" {
+define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast" {
; CHECK-LABEL: f4:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp x1, #1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index c25b0f212826..78d908455e01 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -16,7 +16,6 @@ define i32 @divergent_if_swap_brtarget_order0(i32 %value) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB0_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%c = icmp ne i32 %value, 0
@@ -44,7 +43,6 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB1_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%c = icmp ne i32 %value, 0
@@ -74,7 +72,6 @@ define i32 @divergent_if_nonboolean_condition0(i32 %value) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB2_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%c = trunc i32 %value to i1
@@ -106,7 +103,6 @@ define i32 @divergent_if_nonboolean_condition1(ptr addrspace(1) %ptr) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB3_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%value = load i32, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
index 303dc46e2c88..5c22d5bdcf74 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
@@ -131,8 +131,6 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB1_2: ; %bb1
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%val = call i1 @llvm.amdgcn.is.private(ptr %ptr)
br i1 %val, label %bb0, label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
index 63702d258757..e005c38355a3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
@@ -131,8 +131,6 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB1_2: ; %bb1
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%val = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
br i1 %val, label %bb0, label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 352adacbda3e..af6f69130910 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -39,9 +39,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -65,11 +65,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -92,11 +92,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -253,8 +253,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -504,11 +504,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -544,11 +544,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_add_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -944,7 +944,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
@@ -952,6 +951,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -974,7 +974,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1006,7 +1005,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1219,11 +1217,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
; GFX8-NEXT: s_mov_b32 s7, 0xf000
@@ -1258,11 +1256,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
@@ -1530,10 +1528,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -1557,12 +1555,12 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB7_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -1585,12 +1583,12 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB7_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -1751,8 +1749,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -2006,11 +2004,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB9_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -2046,11 +2044,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB9_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -2446,7 +2444,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
@@ -2454,6 +2451,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -2477,7 +2475,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB11_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
@@ -2487,6 +2484,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -2509,7 +2507,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB11_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
@@ -2519,6 +2516,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3081,11 +3079,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB14_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3121,11 +3119,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB14_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3355,11 +3353,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB15_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3395,11 +3393,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB15_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3629,11 +3627,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB16_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3669,11 +3667,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB16_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3903,11 +3901,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB17_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3943,11 +3941,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB17_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4151,7 +4149,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1
@@ -4162,6 +4159,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -4182,7 +4180,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB18_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, 1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4216,7 +4213,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB18_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4419,11 +4415,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB19_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -4459,11 +4455,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB19_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4667,7 +4663,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2
@@ -4678,6 +4673,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -4698,7 +4694,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB20_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, -2
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4732,7 +4727,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB20_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, -2
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4935,11 +4929,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB21_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -4975,11 +4969,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB21_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5183,7 +5177,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
@@ -5193,6 +5186,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -5214,7 +5208,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB22_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
@@ -5226,6 +5219,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -5246,7 +5240,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB22_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -5258,6 +5251,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5446,11 +5440,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB23_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -5486,11 +5480,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB23_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5694,7 +5688,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5704,6 +5697,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -5725,7 +5719,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5737,6 +5730,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -5757,7 +5751,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5769,6 +5762,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index 19a1d2d9dbd1..c9076a9541b2 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -186,7 +186,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: .LBB1_8: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 98658834e897..bf4302c156d8 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -5678,22 +5678,18 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_clause 0x4
-; GFX11-NEXT: scratch_store_b128 off, v[18:21], s0 offset:64
-; GFX11-NEXT: scratch_store_b128 off, v[10:13], s0 offset:32
-; GFX11-NEXT: scratch_store_b128 off, v[6:9], s0 offset:16
-; GFX11-NEXT: scratch_store_b128 off, v[2:5], s0
-; GFX11-NEXT: scratch_store_b16 off, v1, s0 offset:128
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: s_add_i32 s3, s0, 0x50
-; GFX11-NEXT: s_add_i32 s0, s0, 48
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: scratch_store_b128 v0, v[22:25], off offset:80
+; GFX11-NEXT: scratch_store_b128 v0, v[18:21], off offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[14:17], off offset:48
+; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b128 off, v[30:33], s1
-; GFX11-NEXT: scratch_store_b128 off, v[26:29], s2
-; GFX11-NEXT: scratch_store_b128 off, v[22:25], s3
-; GFX11-NEXT: scratch_store_b128 off, v[14:17], s0
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_store_b128 v0, v[30:33], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[26:29], off offset:96
+; GFX11-NEXT: scratch_store_b16 v0, v1, off offset:128
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
%ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1
@@ -8827,19 +8823,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54
; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58
; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0xf0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s3, s0, 0xd0
-; GFX11-NEXT: s_add_i32 s4, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s5, s0, 0xb0
-; GFX11-NEXT: s_add_i32 s6, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s7, s0, 0x90
-; GFX11-NEXT: s_add_i32 s8, s0, 0x70
-; GFX11-NEXT: s_add_i32 s9, s0, 0x60
-; GFX11-NEXT: s_add_i32 s10, s0, 0x50
-; GFX11-NEXT: s_add_i32 s11, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(31)
; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; GFX11-NEXT: s_waitcnt vmcnt(30)
@@ -8936,23 +8919,23 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2
; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37
-; GFX11-NEXT: scratch_store_b128 off, v[96:99], s1
-; GFX11-NEXT: scratch_store_b128 off, v[84:87], s2
-; GFX11-NEXT: scratch_store_b128 off, v[80:83], s3
-; GFX11-NEXT: scratch_store_b128 off, v[68:71], s4
-; GFX11-NEXT: scratch_store_b128 off, v[64:67], s5
-; GFX11-NEXT: scratch_store_b128 off, v[52:55], s6
-; GFX11-NEXT: scratch_store_b128 off, v[48:51], s7
-; GFX11-NEXT: scratch_store_b128 off, v[33:36], s0 offset:128
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s8
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s9
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s10
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s0 offset:64
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s11
-; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_store_b128 v0, v[96:99], off offset:240
+; GFX11-NEXT: scratch_store_b128 v0, v[84:87], off offset:224
+; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off offset:208
+; GFX11-NEXT: scratch_store_b128 v0, v[68:71], off offset:192
+; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:176
+; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160
+; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144
+; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <32 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <32 x bfloat> %load to <32 x double>
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
index ac50fb86c96f..da609bfa8ede 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
@@ -41,7 +41,7 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add
; GCN-NEXT: .LBB0_2: ; %endif
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0000
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:2300
; GCN-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 069c57e2ae63..6dabd8c0b83e 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -103,7 +103,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB0_4: ; %exit
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -131,7 +130,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB0_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -266,7 +264,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB1_4: ; %exit
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -294,7 +291,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB1_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -431,7 +427,6 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX9-NEXT: .LBB2_4: ; %exit
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2
; GFX9-NEXT: v_mov_b32_e32 v5, 0x3800
; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
@@ -461,7 +456,6 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB2_4: ; %exit
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -665,7 +659,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB3_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -871,7 +864,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB4_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1081,7 +1073,6 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB5_4: ; %exit
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -1432,7 +1423,6 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB7_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5
; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900
; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00
@@ -1724,7 +1714,6 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB8_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v5
; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900
; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index db89ad66ffab..3b2f15c8340a 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -114,7 +114,6 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; CIGFX89-NEXT: s_waitcnt vmcnt(0)
; CIGFX89-NEXT: .LBB3_2: ; %bb2
; CIGFX89-NEXT: s_or_b64 exec, exec, s[4:5]
-; CIGFX89-NEXT: s_waitcnt vmcnt(0)
; CIGFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i1_arg_i1_use:
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index acadee279817..401cbce00ac9 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -1561,34 +1561,28 @@ define <33 x i32> @v33i32_func_void() #0 {
; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
-; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
-; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0
-; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
+; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: s_add_i32 s3, s0, 0x50
-; GFX11-NEXT: s_add_i32 s4, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:48
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128
+; GFX11-NEXT: scratch_store_b32 v0, v33, off offset:128
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
%val = load <33 x i32>, ptr addrspace(1) %ptr
@@ -1850,34 +1844,28 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
-; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
-; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0
-; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
+; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: s_add_i32 s3, s0, 0x50
-; GFX11-NEXT: s_add_i32 s4, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:48
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128
+; GFX11-NEXT: scratch_store_b32 v0, v33, off offset:128
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
%val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr
@@ -2143,33 +2131,24 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:144
; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:128
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0xf0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s3, s0, 0xd0
-; GFX11-NEXT: s_add_i32 s4, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s5, s0, 0xb0
-; GFX11-NEXT: s_add_i32 s6, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s7, s0, 0x90
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:240
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:224
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:208
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s4
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:192
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s5
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:176
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s6
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:160
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:144
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:128
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v33, s0
+; GFX11-NEXT: scratch_store_b32 v0, v33, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
%val = load { i32, <32 x i32> }, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index c1d682689903..3b078c41f4a8 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -1989,256 +1989,138 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_clause 0x7
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:1024
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:512
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:256
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:128
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:64
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:32
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:16
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
-; GFX11-NEXT: s_add_i32 s1, s0, 0x7f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x7e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x7d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x7c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x7b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x7a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x790
-; GFX11-NEXT: s_add_i32 s2, s0, 0x780
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x770
-; GFX11-NEXT: s_add_i32 s2, s0, 0x760
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x750
-; GFX11-NEXT: s_add_i32 s2, s0, 0x740
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x730
-; GFX11-NEXT: s_add_i32 s2, s0, 0x720
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x710
-; GFX11-NEXT: s_add_i32 s2, s0, 0x700
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x6f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x6e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x6d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x6c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x6b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x6a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x690
-; GFX11-NEXT: s_add_i32 s2, s0, 0x680
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x670
-; GFX11-NEXT: s_add_i32 s2, s0, 0x660
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x650
-; GFX11-NEXT: s_add_i32 s2, s0, 0x640
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x630
-; GFX11-NEXT: s_add_i32 s2, s0, 0x620
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x610
-; GFX11-NEXT: s_add_i32 s2, s0, 0x600
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x5f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x5e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x5d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x5c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x5b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x5a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x590
-; GFX11-NEXT: s_add_i32 s2, s0, 0x580
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x570
-; GFX11-NEXT: s_add_i32 s2, s0, 0x560
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x550
-; GFX11-NEXT: s_add_i32 s2, s0, 0x540
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x530
-; GFX11-NEXT: s_add_i32 s2, s0, 0x520
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x510
-; GFX11-NEXT: s_add_i32 s2, s0, 0x500
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x4f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x4e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x4d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x4c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x4b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x4a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x490
-; GFX11-NEXT: s_add_i32 s2, s0, 0x480
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x470
-; GFX11-NEXT: s_add_i32 s2, s0, 0x460
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x450
-; GFX11-NEXT: s_add_i32 s2, s0, 0x440
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x430
-; GFX11-NEXT: s_add_i32 s2, s0, 0x420
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x410
-; GFX11-NEXT: s_add_i32 s2, s0, 0x3f0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x3e0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x3d0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x3c0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x3b0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x3a0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x390
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x380
-; GFX11-NEXT: s_add_i32 s2, s0, 0x370
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x360
-; GFX11-NEXT: s_add_i32 s2, s0, 0x350
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x340
-; GFX11-NEXT: s_add_i32 s2, s0, 0x330
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x320
-; GFX11-NEXT: s_add_i32 s2, s0, 0x310
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x300
-; GFX11-NEXT: s_add_i32 s2, s0, 0x2f0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x2e0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x2d0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x2c0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x2b0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x2a0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x290
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x280
-; GFX11-NEXT: s_add_i32 s2, s0, 0x270
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x260
-; GFX11-NEXT: s_add_i32 s2, s0, 0x250
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x240
-; GFX11-NEXT: s_add_i32 s2, s0, 0x230
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x220
-; GFX11-NEXT: s_add_i32 s2, s0, 0x210
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x1f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x1e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x1d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x1c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x1b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x1a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x190
-; GFX11-NEXT: s_add_i32 s2, s0, 0x180
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x170
-; GFX11-NEXT: s_add_i32 s2, s0, 0x160
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x150
-; GFX11-NEXT: s_add_i32 s2, s0, 0x140
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x130
-; GFX11-NEXT: s_add_i32 s2, s0, 0x120
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x110
-; GFX11-NEXT: s_add_i32 s2, s0, 0xf0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xd0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xb0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x90
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x50
-; GFX11-NEXT: s_add_i32 s0, s0, 48
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2032
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2016
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2000
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1984
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1968
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1952
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1936
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1920
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1904
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1888
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1872
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1856
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1840
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1824
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1808
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1792
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1776
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1760
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1744
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1728
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1712
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1696
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1680
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1664
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1648
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1632
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1616
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1600
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1584
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1568
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1552
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1536
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1520
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1504
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1488
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1472
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1456
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1440
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1424
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1408
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1392
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1376
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1360
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1344
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1328
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1312
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1296
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1280
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1264
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1248
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1232
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1216
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1200
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1184
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1168
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1152
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1136
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1120
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1104
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1088
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1072
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1056
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1040
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1024
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1008
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:992
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:976
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:960
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:944
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:928
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:912
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:896
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:880
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:864
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:848
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:832
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:816
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:800
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:784
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:768
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:752
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:736
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:720
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:704
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:688
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:672
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:656
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:640
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:624
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:608
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:592
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:576
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:560
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:544
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:528
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:512
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:496
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:480
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:464
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:448
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:432
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:416
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:400
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:384
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:368
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:352
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:336
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:320
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:304
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:288
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:272
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:256
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:240
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:224
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:208
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:192
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:176
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:160
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:144
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:80
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
ret <512 x i32> zeroinitializer
@@ -2636,7 +2518,6 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-LABEL: return_72xi32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_clause 0xc
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:212
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208
@@ -2651,93 +2532,82 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:168
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:164
-; GFX11-NEXT: s_clause 0x14
-; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:80
-; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:76
-; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:96
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:92
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:88
-; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:112
-; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:108
-; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:104
-; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:128
-; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:124
-; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:120
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s0 offset:64
+; GFX11-NEXT: s_clause 0x11
+; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:80
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112
+; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108
+; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:144
-; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:140
-; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:136
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32
+; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128
+; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124
+; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:160
-; GFX11-NEXT: scratch_load_b32 v11, off, s32 offset:156
-; GFX11-NEXT: scratch_load_b32 v10, off, s32 offset:152
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16
+; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144
+; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140
+; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-NEXT: s_clause 0xd
-; GFX11-NEXT: scratch_load_b32 v8, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v7, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v6, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v5, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v9, off, s32 offset:148
-; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:132
-; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:116
-; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:100
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:84
-; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:148
+; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:132
+; GFX11-NEXT: scratch_load_b32 v16, off, s32 offset:116
+; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:100
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:84
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v32, off, s32
-; GFX11-NEXT: s_add_i32 s1, s0, 0x110
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x100
-; GFX11-NEXT: s_add_i32 s3, s0, 0xf0
-; GFX11-NEXT: s_add_i32 s34, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s35, s0, 0xd0
-; GFX11-NEXT: s_add_i32 s36, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s37, s0, 0xb0
-; GFX11-NEXT: s_add_i32 s38, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s39, s0, 0x90
-; GFX11-NEXT: s_add_i32 s40, s0, 0x70
-; GFX11-NEXT: s_add_i32 s41, s0, 0x60
-; GFX11-NEXT: s_add_i32 s42, s0, 0x50
-; GFX11-NEXT: s_add_i32 s43, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(10)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[60:63], off offset:272
; GFX11-NEXT: s_waitcnt vmcnt(9)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[12:15], off offset:256
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[16:19], off offset:240
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[60:63], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[20:23], off offset:224
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[56:59], s34
+; GFX11-NEXT: scratch_store_b128 v0, v[56:59], off offset:208
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[41:44], s35
+; GFX11-NEXT: scratch_store_b128 v0, v[41:44], off offset:192
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[37:40], s36
+; GFX11-NEXT: scratch_store_b128 v0, v[37:40], off offset:176
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[52:55], s37
+; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[48:51], s38
+; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[33:36], s39
+; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s40
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s41
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s42
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s43
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: s_clause 0xc
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168
@@ -3306,7 +3176,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-LABEL: call_72xi32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s46, s33
+; GFX11-NEXT: s_mov_b32 s34, s33
; GFX11-NEXT: s_add_i32 s33, s32, 0x1ff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00
@@ -3353,11 +3223,11 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1
; GFX11-NEXT: s_add_i32 s0, s32, 32
; GFX11-NEXT: s_add_i32 s1, s32, 16
+; GFX11-NEXT: s_add_i32 s2, s33, 0x200
+; GFX11-NEXT: v_writelane_b32 v60, s30, 0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1
-; GFX11-NEXT: s_add_i32 s0, s33, 0x200
-; GFX11-NEXT: v_writelane_b32 v60, s30, 0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0
; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, 0
@@ -3373,14 +3243,14 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0
; GFX11-NEXT: v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0
; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0
-; GFX11-NEXT: s_mov_b32 s45, return_72xi32@abs32@hi
-; GFX11-NEXT: s_mov_b32 s44, return_72xi32@abs32@lo
+; GFX11-NEXT: s_mov_b32 s1, return_72xi32@abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, return_72xi32@abs32@lo
; GFX11-NEXT: v_writelane_b32 v60, s31, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45]
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_load_b128 v[45:48], off, s33 offset:624
; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:640
-; GFX11-NEXT: s_add_i32 s0, s32, 0xa0
+; GFX11-NEXT: s_add_i32 s2, s32, 0xa0
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_mov_b32_e32 v32, v48
; GFX11-NEXT: s_clause 0x9
@@ -3431,38 +3301,38 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v6
; GFX11-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v6, v9
; GFX11-NEXT: v_mov_b32_e32 v9, v20
-; GFX11-NEXT: scratch_store_b32 off, v11, s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x90
+; GFX11-NEXT: scratch_store_b32 off, v11, s2
+; GFX11-NEXT: s_add_i32 s2, s32, 0x90
; GFX11-NEXT: v_mov_b32_e32 v11, v22
-; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x80
+; GFX11-NEXT: scratch_store_b128 off, v[4:7], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 0x80
; GFX11-NEXT: v_mov_b32_e32 v5, v16
-; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0
+; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2
; GFX11-NEXT: v_mov_b32_e32 v0, 24
-; GFX11-NEXT: s_add_i32 s0, s32, 0x70
+; GFX11-NEXT: s_add_i32 s2, s32, 0x70
; GFX11-NEXT: v_mov_b32_e32 v6, v17
-; GFX11-NEXT: scratch_store_b128 off, v[12:15], s0
+; GFX11-NEXT: scratch_store_b128 off, v[12:15], s2
; GFX11-NEXT: v_mov_b32_e32 v13, v24
-; GFX11-NEXT: s_add_i32 s0, s32, 0x6c
+; GFX11-NEXT: s_add_i32 s2, s32, 0x6c
; GFX11-NEXT: v_mov_b32_e32 v7, v18
-; GFX11-NEXT: scratch_store_b32 off, v0, s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x60
+; GFX11-NEXT: scratch_store_b32 off, v0, s2
+; GFX11-NEXT: s_add_i32 s2, s32, 0x60
; GFX11-NEXT: v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v15, v26
-; GFX11-NEXT: scratch_store_b96 off, v[56:58], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x50
+; GFX11-NEXT: scratch_store_b96 off, v[56:58], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 0x50
; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45
-; GFX11-NEXT: scratch_store_b128 off, v[40:43], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 64
+; GFX11-NEXT: scratch_store_b128 off, v[40:43], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 64
; GFX11-NEXT: v_mov_b32_e32 v14, v25
-; GFX11-NEXT: scratch_store_b128 off, v[52:55], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 48
+; GFX11-NEXT: scratch_store_b128 off, v[52:55], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 48
; GFX11-NEXT: v_mov_b32_e32 v16, v27
-; GFX11-NEXT: scratch_store_b128 off, v[36:39], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 32
+; GFX11-NEXT: scratch_store_b128 off, v[36:39], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 32
; GFX11-NEXT: v_mov_b32_e32 v30, v46
-; GFX11-NEXT: scratch_store_b128 off, v[48:51], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 16
-; GFX11-NEXT: scratch_store_b128 off, v[32:35], s0
+; GFX11-NEXT: scratch_store_b128 off, v[48:51], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 16
+; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2
; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, 42
@@ -3470,10 +3340,10 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1572
; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1556
; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1540
-; GFX11-NEXT: s_add_i32 s0, s33, 0x400
+; GFX11-NEXT: s_add_i32 s2, s33, 0x400
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45]
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_clause 0xb
; GFX11-NEXT: scratch_load_b32 v59, off, s33
; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4
@@ -3493,7 +3363,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:1536 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_addk_i32 s32, 0xf600
-; GFX11-NEXT: s_mov_b32 s33, s46
+; GFX11-NEXT: s_mov_b32 s33, s34
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index 433a836e7ca0..3b3e107a6296 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -33,7 +33,7 @@ define void @func_use_lds_global() {
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -103,7 +103,7 @@ define void @func_use_lds_global_constexpr_cast() {
; GFX8-SDAG-LABEL: func_use_lds_global_constexpr_cast:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-SDAG-NEXT: s_trap 2
@@ -171,7 +171,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_2
; GFX8-SDAG-NEXT: ; %bb.1: ; %bb1
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 1
-; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -181,7 +181,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_4
; GFX8-SDAG-NEXT: ; %bb.3: ; %bb0
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -189,7 +189,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-NEXT: .LBB2_4: ; %ret
; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -379,7 +379,7 @@ define void @func_uses_lds_code_after(ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
; GFX8-SDAG-NEXT: ds_write_b32 v0, v2
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 1
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -472,7 +472,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: ; %bb.1: ; %use.bb
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
-; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -481,7 +481,6 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX8-SDAG-NEXT: .LBB4_2: ; %ret
; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: func_uses_lds_phi_after:
@@ -506,7 +505,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX8-GISEL-NEXT: .LBB4_2: ; %ret
; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: func_uses_lds_phi_after:
@@ -527,7 +526,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB4_2: ; %ret
; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: func_uses_lds_phi_after:
@@ -548,7 +547,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: .LBB4_2: ; %ret
; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-LABEL: func_uses_lds_phi_after:
@@ -570,7 +569,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: .LBB4_3: ; %ret
; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
; SDAG-NEXT: .LBB4_4:
; SDAG-NEXT: s_endpgm
@@ -594,7 +593,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: .LBB4_3: ; %ret
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
; GISEL-NEXT: .LBB4_4:
; GISEL-NEXT: s_endpgm
@@ -616,6 +615,3 @@ ret:
; CHECK: {{.*}}
; GFX8: {{.*}}
; GFX9: {{.*}}
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 5e76dfd9bddd..4477f028c6d2 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -157,7 +157,6 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; VI-NEXT: .LBB2_2:
; VI-NEXT: s_or_b64 exec, exec, s[6:7]
; VI-NEXT: s_mov_b64 s[6:7], exec
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_readfirstlane_b32 s8, v1
; VI-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0
; VI-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1
@@ -203,15 +202,14 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; VI-NEXT: ; %bb.7:
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_add_rtn_f32 v2, v2, v1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: .LBB2_8:
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_readfirstlane_b32 s2, v2
; VI-NEXT: v_add_f32_e32 v2, s2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -240,7 +238,6 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; GFX9-NEXT: .LBB2_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1
@@ -285,16 +282,15 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; GFX9-NEXT: s_cbranch_execz .LBB2_8
; GFX9-NEXT: ; %bb.7:
; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_f32 v2, v2, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB2_8:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_add_f32_e32 v0, s2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 138dd53b3ede..d19ef75cb08c 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1260,8 +1260,6 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB11_5: ; %end
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB11_6:
; GFX11-NEXT: s_mov_b64 exec, 0
@@ -1525,8 +1523,6 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB13_5: ; %UnifiedReturnBlock
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB13_6:
; GFX11-NEXT: s_mov_b64 exec, 0
diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
index eef5f57beb07..ecebbb9ac874 100644
--- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -32,7 +32,7 @@ define amdgpu_ps float @test_return_to_epilog_into_end_block(i32 inreg %a, float
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: S_WAITCNT_soft 3952
+ ; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.3:
entry:
@@ -79,7 +79,7 @@ define amdgpu_ps float @test_unify_return_to_epilog_into_end_block(i32 inreg %a,
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: S_WAITCNT_soft 3952
+ ; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.5:
entry:
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
index bafa92e06834..65d0768c6088 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -18,14 +18,12 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
; RV32-NEXT: vmsne.vi v0, v8, 0
; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v8, v8, -1, v0
-; RV32-NEXT: vand.vv v8, v11, v8
+; RV32-NEXT: vmerge.vvm v8, v8, v11, v0
; RV32-NEXT: vredmaxu.vs v8, v8, v8
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: sub a0, a0, a1
-; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srli a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: ctz_nxv4i32:
@@ -41,14 +39,12 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
; RV64-NEXT: vmsne.vi v0, v8, 0
; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vand.vv v8, v11, v8
+; RV64-NEXT: vmerge.vvm v8, v8, v11, v0
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a1, v8
-; RV64-NEXT: sub a0, a0, a1
-; RV64-NEXT: lui a1, 16
-; RV64-NEXT: addiw a1, a1, -1
-; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: subw a0, a0, a1
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srli a0, a0, 48
; RV64-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i32(<vscale x 4 x i32> %a, i1 0)
ret i32 %res
@@ -158,8 +154,7 @@ define i32 @ctz_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
; RV64-NEXT: li a1, -1
; RV64-NEXT: vmadd.vx v16, a1, v8
; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vand.vv v8, v16, v8
+; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a1, v8
; RV64-NEXT: subw a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
index f5305a1c36de..83d1d1b3f94c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
@@ -19,10 +19,9 @@ define <4 x i64> @vwsll_vv_v4i64_sext(<4 x i32> %a, <4 x i32> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v4i64_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <4 x i32> %a to <4 x i64>
%y = sext <4 x i32> %b to <4 x i64>
@@ -41,10 +40,9 @@ define <4 x i64> @vwsll_vv_v4i64_zext(<4 x i32> %a, <4 x i32> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v4i64_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <4 x i32> %a to <4 x i64>
%y = zext <4 x i32> %b to <4 x i64>
@@ -62,9 +60,9 @@ define <4 x i64> @vwsll_vx_i64_v4i64(<4 x i32> %a, i64 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i64_v4i64:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i64> poison, i64 %b, i32 0
%splat = shufflevector <4 x i64> %head, <4 x i64> poison, <4 x i32> zeroinitializer
@@ -88,10 +86,8 @@ define <4 x i64> @vwsll_vx_i32_v4i64_sext(<4 x i32> %a, i32 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i32> poison, i32 %b, i32 0
%splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer
@@ -116,10 +112,8 @@ define <4 x i64> @vwsll_vx_i32_v4i64_zext(<4 x i32> %a, i32 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i32> poison, i32 %b, i32 0
%splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer
@@ -142,12 +136,9 @@ define <4 x i64> @vwsll_vx_i16_v4i64_sext(<4 x i32> %a, i16 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i16_v4i64_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf4 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i16> poison, i16 %b, i32 0
%splat = shufflevector <4 x i16> %head, <4 x i16> poison, <4 x i32> zeroinitializer
@@ -170,12 +161,9 @@ define <4 x i64> @vwsll_vx_i16_v4i64_zext(<4 x i32> %a, i16 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i16_v4i64_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf4 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i16> poison, i16 %b, i32 0
%splat = shufflevector <4 x i16> %head, <4 x i16> poison, <4 x i32> zeroinitializer
@@ -198,12 +186,9 @@ define <4 x i64> @vwsll_vx_i8_v4i64_sext(<4 x i32> %a, i8 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i8_v4i64_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf8 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i8> poison, i8 %b, i32 0
%splat = shufflevector <4 x i8> %head, <4 x i8> poison, <4 x i32> zeroinitializer
@@ -226,12 +211,9 @@ define <4 x i64> @vwsll_vx_i8_v4i64_zext(<4 x i32> %a, i8 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i8_v4i64_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf8 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i8> poison, i8 %b, i32 0
%splat = shufflevector <4 x i8> %head, <4 x i8> poison, <4 x i32> zeroinitializer
@@ -251,9 +233,9 @@ define <4 x i64> @vwsll_vi_v4i64(<4 x i32> %a) {
;
; CHECK-ZVBB-LABEL: vwsll_vi_v4i64:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vi v10, v8, 2
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <4 x i32> %a to <4 x i64>
%z = shl <4 x i64> %x, splat (i64 2)
@@ -275,10 +257,9 @@ define <8 x i32> @vwsll_vv_v8i32_sext(<8 x i16> %a, <8 x i16> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v8i32_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <8 x i16> %a to <8 x i32>
%y = sext <8 x i16> %b to <8 x i32>
@@ -297,10 +278,9 @@ define <8 x i32> @vwsll_vv_v8i32_zext(<8 x i16> %a, <8 x i16> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v8i32_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <8 x i16> %a to <8 x i32>
%y = zext <8 x i16> %b to <8 x i32>
@@ -318,9 +298,9 @@ define <8 x i32> @vwsll_vx_i64_v8i32(<8 x i16> %a, i64 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i64_v8i32:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i64> poison, i64 %b, i32 0
%splat = shufflevector <8 x i64> %head, <8 x i64> poison, <8 x i32> zeroinitializer
@@ -340,9 +320,9 @@ define <8 x i32> @vwsll_vx_i32_v8i32(<8 x i16> %a, i32 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i32_v8i32:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i32> poison, i32 %b, i32 0
%splat = shufflevector <8 x i32> %head, <8 x i32> poison, <8 x i32> zeroinitializer
@@ -366,10 +346,8 @@ define <8 x i32> @vwsll_vx_i16_v8i32_sext(<8 x i16> %a, i16 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i16> poison, i16 %b, i32 0
%splat = shufflevector <8 x i16> %head, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -394,10 +372,8 @@ define <8 x i32> @vwsll_vx_i16_v8i32_zext(<8 x i16> %a, i16 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i16> poison, i16 %b, i32 0
%splat = shufflevector <8 x i16> %head, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -420,12 +396,9 @@ define <8 x i32> @vwsll_vx_i8_v8i32_sext(<8 x i16> %a, i8 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i8_v8i32_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf4 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i8> poison, i8 %b, i32 0
%splat = shufflevector <8 x i8> %head, <8 x i8> poison, <8 x i32> zeroinitializer
@@ -448,12 +421,9 @@ define <8 x i32> @vwsll_vx_i8_v8i32_zext(<8 x i16> %a, i8 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i8_v8i32_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf4 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i8> poison, i8 %b, i32 0
%splat = shufflevector <8 x i8> %head, <8 x i8> poison, <8 x i32> zeroinitializer
@@ -473,9 +443,9 @@ define <8 x i32> @vwsll_vi_v8i32(<8 x i16> %a) {
;
; CHECK-ZVBB-LABEL: vwsll_vi_v8i32:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vi v10, v8, 2
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <8 x i16> %a to <8 x i32>
%z = shl <8 x i32> %x, splat (i32 2)
@@ -497,10 +467,9 @@ define <16 x i16> @vwsll_vv_v16i16_sext(<16 x i8> %a, <16 x i8> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v16i16_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <16 x i8> %a to <16 x i16>
%y = sext <16 x i8> %b to <16 x i16>
@@ -519,10 +488,9 @@ define <16 x i16> @vwsll_vv_v16i16_zext(<16 x i8> %a, <16 x i8> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v16i16_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <16 x i8> %a to <16 x i16>
%y = zext <16 x i8> %b to <16 x i16>
@@ -552,12 +520,9 @@ define <16 x i16> @vwsll_vx_i32_v16i16(<16 x i8> %a, i32 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i32_v16i16:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v12, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vnsrl.wi v8, v12, 0
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8
+; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <16 x i32> poison, i32 %b, i32 0
%splat = shufflevector <16 x i32> %head, <16 x i32> poison, <16 x i32> zeroinitializer
@@ -577,9 +542,9 @@ define <16 x i16> @vwsll_vx_i16_v16i16(<16 x i8> %a, i16 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i16_v16i16:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0
+; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <16 x i16> poison, i16 %b, i32 0
%splat = shufflevector <16 x i16> %head, <16 x i16> poison, <16 x i32> zeroinitializer
@@ -603,10 +568,8 @@ define <16 x i16> @vwsll_vx_i8_v16i16_sext(<16 x i8> %a, i8 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <16 x i8> poison, i8 %b, i32 0
%splat = shufflevector <16 x i8> %head, <16 x i8> poison, <16 x i32> zeroinitializer
@@ -631,10 +594,8 @@ define <16 x i16> @vwsll_vx_i8_v16i16_zext(<16 x i8> %a, i8 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <16 x i8> poison, i8 %b, i32 0
%splat = shufflevector <16 x i8> %head, <16 x i8> poison, <16 x i32> zeroinitializer
@@ -654,9 +615,9 @@ define <16 x i16> @vwsll_vi_v16i16(<16 x i8> %a) {
;
; CHECK-ZVBB-LABEL: vwsll_vi_v16i16:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2
+; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vi v10, v8, 2
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <16 x i8> %a to <16 x i16>
%z = shl <16 x i16> %x, splat (i16 2)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-binop-into-select.ll b/llvm/test/CodeGen/RISCV/rvv/fold-binop-into-select.ll
new file mode 100644
index 000000000000..3a8d08f306a5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fold-binop-into-select.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+
+; The following binop x, (zext i1) tests will be vector-legalized into a vselect
+; of two splat_vectors, but on RV64 the splat value will be implicitly
+; truncated:
+;
+; t15: nxv2i32 = splat_vector Constant:i64<1>
+; t13: nxv2i32 = splat_vector Constant:i64<0>
+; t16: nxv2i32 = vselect t2, t15, t13
+; t7: nxv2i32 = add t4, t16
+;
+; Make sure that foldSelectWithIdentityConstant in DAGCombiner.cpp handles the
+; truncating splat, so we pull the vselect back and fold it into a mask.
+
+define <vscale x 2 x i32> @i1_zext_add(<vscale x 2 x i1> %a, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: i1_zext_add:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t
+; CHECK-NEXT: ret
+ %zext = zext <vscale x 2 x i1> %a to <vscale x 2 x i32>
+ %add = add <vscale x 2 x i32> %b, %zext
+ ret <vscale x 2 x i32> %add
+}
+
+define <vscale x 2 x i32> @i1_zext_add_commuted(<vscale x 2 x i1> %a, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: i1_zext_add_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t
+; CHECK-NEXT: ret
+ %zext = zext <vscale x 2 x i1> %a to <vscale x 2 x i32>
+ %add = add <vscale x 2 x i32> %zext, %b
+ ret <vscale x 2 x i32> %add
+}
+
+define <vscale x 2 x i32> @i1_zext_sub(<vscale x 2 x i1> %a, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: i1_zext_sub:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %zext = zext <vscale x 2 x i1> %a to <vscale x 2 x i32>
+ %sub = sub <vscale x 2 x i32> %b, %zext
+ ret <vscale x 2 x i32> %sub
+}
+
+define <vscale x 2 x i32> @i1_zext_or(<vscale x 2 x i1> %a, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: i1_zext_or:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; CHECK-NEXT: vor.vi v8, v8, 1, v0.t
+; CHECK-NEXT: ret
+ %zext = zext <vscale x 2 x i1> %a to <vscale x 2 x i32>
+ %or = or <vscale x 2 x i32> %b, %zext
+ ret <vscale x 2 x i32> %or
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
index e56dca0732bb..a14ce7172615 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
@@ -149,49 +149,49 @@ define <vscale x 2 x i64> @vwop_vscale_sext_i32i64_multiple_users(ptr %x, ptr %y
}
define <vscale x 2 x i32> @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y, ptr %z) {
-; RV32-LABEL: vwop_vscale_sext_i1i32_multiple_users:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, mu
-; RV32-NEXT: vlm.v v8, (a0)
-; RV32-NEXT: vlm.v v9, (a1)
-; RV32-NEXT: vlm.v v10, (a2)
-; RV32-NEXT: vmv.v.i v11, 0
-; RV32-NEXT: vmv.v.v v0, v8
-; RV32-NEXT: vmerge.vim v12, v11, -1, v0
-; RV32-NEXT: vmv.v.v v0, v9
-; RV32-NEXT: vmerge.vim v9, v11, -1, v0
-; RV32-NEXT: vmv.v.v v0, v10
-; RV32-NEXT: vmerge.vim v10, v11, -1, v0
-; RV32-NEXT: vmul.vv v9, v12, v9
-; RV32-NEXT: li a0, 1
-; RV32-NEXT: vsub.vv v11, v12, v10
-; RV32-NEXT: vmv.v.v v0, v8
-; RV32-NEXT: vsub.vx v10, v10, a0, v0.t
-; RV32-NEXT: vor.vv v8, v9, v10
-; RV32-NEXT: vor.vv v8, v8, v11
-; RV32-NEXT: ret
+; NO_FOLDING-LABEL: vwop_vscale_sext_i1i32_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu
+; NO_FOLDING-NEXT: vlm.v v8, (a0)
+; NO_FOLDING-NEXT: vlm.v v9, (a1)
+; NO_FOLDING-NEXT: vlm.v v10, (a2)
+; NO_FOLDING-NEXT: vmv.v.i v11, 0
+; NO_FOLDING-NEXT: vmv.v.v v0, v8
+; NO_FOLDING-NEXT: vmerge.vim v12, v11, -1, v0
+; NO_FOLDING-NEXT: vmv.v.v v0, v9
+; NO_FOLDING-NEXT: vmerge.vim v9, v11, -1, v0
+; NO_FOLDING-NEXT: vmv.v.v v0, v10
+; NO_FOLDING-NEXT: vmerge.vim v10, v11, -1, v0
+; NO_FOLDING-NEXT: vmul.vv v9, v12, v9
+; NO_FOLDING-NEXT: li a0, 1
+; NO_FOLDING-NEXT: vsub.vv v11, v12, v10
+; NO_FOLDING-NEXT: vmv.v.v v0, v8
+; NO_FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t
+; NO_FOLDING-NEXT: vor.vv v8, v9, v10
+; NO_FOLDING-NEXT: vor.vv v8, v8, v11
+; NO_FOLDING-NEXT: ret
;
-; RV64-LABEL: vwop_vscale_sext_i1i32_multiple_users:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma
-; RV64-NEXT: vlm.v v8, (a0)
-; RV64-NEXT: vlm.v v9, (a1)
-; RV64-NEXT: vlm.v v10, (a2)
-; RV64-NEXT: vmv.v.i v11, 0
-; RV64-NEXT: vmv.v.v v0, v8
-; RV64-NEXT: vmerge.vim v12, v11, -1, v0
-; RV64-NEXT: vmv.v.v v0, v9
-; RV64-NEXT: vmerge.vim v9, v11, -1, v0
-; RV64-NEXT: vmv.v.v v0, v10
-; RV64-NEXT: vmerge.vim v10, v11, -1, v0
-; RV64-NEXT: vmul.vv v9, v12, v9
-; RV64-NEXT: vmv.v.v v0, v8
-; RV64-NEXT: vmerge.vim v8, v11, 1, v0
-; RV64-NEXT: vsub.vv v8, v10, v8
-; RV64-NEXT: vsub.vv v10, v12, v10
-; RV64-NEXT: vor.vv v8, v9, v8
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: ret
+; FOLDING-LABEL: vwop_vscale_sext_i1i32_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu
+; FOLDING-NEXT: vlm.v v8, (a0)
+; FOLDING-NEXT: vlm.v v9, (a1)
+; FOLDING-NEXT: vlm.v v10, (a2)
+; FOLDING-NEXT: vmv.v.i v11, 0
+; FOLDING-NEXT: vmv.v.v v0, v8
+; FOLDING-NEXT: vmerge.vim v12, v11, -1, v0
+; FOLDING-NEXT: vmv.v.v v0, v9
+; FOLDING-NEXT: vmerge.vim v9, v11, -1, v0
+; FOLDING-NEXT: vmv.v.v v0, v10
+; FOLDING-NEXT: vmerge.vim v10, v11, -1, v0
+; FOLDING-NEXT: vmul.vv v9, v12, v9
+; FOLDING-NEXT: li a0, 1
+; FOLDING-NEXT: vsub.vv v11, v12, v10
+; FOLDING-NEXT: vmv.v.v v0, v8
+; FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t
+; FOLDING-NEXT: vor.vv v8, v9, v10
+; FOLDING-NEXT: vor.vv v8, v8, v11
+; FOLDING-NEXT: ret
%a = load <vscale x 2 x i1>, ptr %x
%b = load <vscale x 2 x i1>, ptr %y
%b2 = load <vscale x 2 x i1>, ptr %z
@@ -209,7 +209,7 @@ define <vscale x 2 x i32> @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y,
define <vscale x 2 x i8> @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, ptr %z) {
; NO_FOLDING-LABEL: vwop_vscale_sext_i1i8_multiple_users:
; NO_FOLDING: # %bb.0:
-; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu
; NO_FOLDING-NEXT: vlm.v v8, (a0)
; NO_FOLDING-NEXT: vlm.v v9, (a1)
; NO_FOLDING-NEXT: vlm.v v10, (a2)
@@ -221,17 +221,17 @@ define <vscale x 2 x i8> @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p
; NO_FOLDING-NEXT: vmv1r.v v0, v10
; NO_FOLDING-NEXT: vmerge.vim v10, v11, -1, v0
; NO_FOLDING-NEXT: vmul.vv v9, v12, v9
+; NO_FOLDING-NEXT: li a0, 1
+; NO_FOLDING-NEXT: vsub.vv v11, v12, v10
; NO_FOLDING-NEXT: vmv1r.v v0, v8
-; NO_FOLDING-NEXT: vmerge.vim v8, v11, 1, v0
-; NO_FOLDING-NEXT: vsub.vv v8, v10, v8
-; NO_FOLDING-NEXT: vsub.vv v10, v12, v10
-; NO_FOLDING-NEXT: vor.vv v8, v9, v8
-; NO_FOLDING-NEXT: vor.vv v8, v8, v10
+; NO_FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t
+; NO_FOLDING-NEXT: vor.vv v8, v9, v10
+; NO_FOLDING-NEXT: vor.vv v8, v8, v11
; NO_FOLDING-NEXT: ret
;
; FOLDING-LABEL: vwop_vscale_sext_i1i8_multiple_users:
; FOLDING: # %bb.0:
-; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu
; FOLDING-NEXT: vlm.v v8, (a0)
; FOLDING-NEXT: vlm.v v9, (a1)
; FOLDING-NEXT: vlm.v v10, (a2)
@@ -243,12 +243,12 @@ define <vscale x 2 x i8> @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p
; FOLDING-NEXT: vmv1r.v v0, v10
; FOLDING-NEXT: vmerge.vim v10, v11, -1, v0
; FOLDING-NEXT: vmul.vv v9, v12, v9
+; FOLDING-NEXT: li a0, 1
+; FOLDING-NEXT: vsub.vv v11, v12, v10
; FOLDING-NEXT: vmv1r.v v0, v8
-; FOLDING-NEXT: vmerge.vim v8, v11, 1, v0
-; FOLDING-NEXT: vsub.vv v8, v10, v8
-; FOLDING-NEXT: vsub.vv v10, v12, v10
-; FOLDING-NEXT: vor.vv v8, v9, v8
-; FOLDING-NEXT: vor.vv v8, v8, v10
+; FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t
+; FOLDING-NEXT: vor.vv v8, v9, v10
+; FOLDING-NEXT: vor.vv v8, v8, v11
; FOLDING-NEXT: ret
%a = load <vscale x 2 x i1>, ptr %x
%b = load <vscale x 2 x i1>, ptr %y
@@ -444,41 +444,39 @@ define <vscale x 2 x i64> @vwop_vscale_zext_i32i64_multiple_users(ptr %x, ptr %y
}
define <vscale x 2 x i32> @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y, ptr %z) {
-; RV32-LABEL: vwop_vscale_zext_i1i32_multiple_users:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, mu
-; RV32-NEXT: vlm.v v0, (a0)
-; RV32-NEXT: vlm.v v8, (a2)
-; RV32-NEXT: vlm.v v9, (a1)
-; RV32-NEXT: vmv.v.i v10, 0
-; RV32-NEXT: vmerge.vim v11, v10, 1, v0
-; RV32-NEXT: vmv.v.v v0, v8
-; RV32-NEXT: vmerge.vim v8, v10, 1, v0
-; RV32-NEXT: vadd.vv v10, v11, v8
-; RV32-NEXT: vsub.vv v8, v11, v8
-; RV32-NEXT: vmv.v.v v0, v9
-; RV32-NEXT: vor.vv v10, v10, v11, v0.t
-; RV32-NEXT: vor.vv v8, v10, v8
-; RV32-NEXT: ret
+; NO_FOLDING-LABEL: vwop_vscale_zext_i1i32_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu
+; NO_FOLDING-NEXT: vlm.v v0, (a0)
+; NO_FOLDING-NEXT: vlm.v v8, (a2)
+; NO_FOLDING-NEXT: vlm.v v9, (a1)
+; NO_FOLDING-NEXT: vmv.v.i v10, 0
+; NO_FOLDING-NEXT: vmerge.vim v11, v10, 1, v0
+; NO_FOLDING-NEXT: vmv.v.v v0, v8
+; NO_FOLDING-NEXT: vmerge.vim v8, v10, 1, v0
+; NO_FOLDING-NEXT: vadd.vv v10, v11, v8
+; NO_FOLDING-NEXT: vsub.vv v8, v11, v8
+; NO_FOLDING-NEXT: vmv.v.v v0, v9
+; NO_FOLDING-NEXT: vor.vv v10, v10, v11, v0.t
+; NO_FOLDING-NEXT: vor.vv v8, v10, v8
+; NO_FOLDING-NEXT: ret
;
-; RV64-LABEL: vwop_vscale_zext_i1i32_multiple_users:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma
-; RV64-NEXT: vlm.v v0, (a0)
-; RV64-NEXT: vlm.v v8, (a1)
-; RV64-NEXT: vlm.v v9, (a2)
-; RV64-NEXT: vmv.v.i v10, 0
-; RV64-NEXT: vmerge.vim v11, v10, 1, v0
-; RV64-NEXT: vmv.v.v v0, v8
-; RV64-NEXT: vmerge.vim v8, v10, 1, v0
-; RV64-NEXT: vmv.v.v v0, v9
-; RV64-NEXT: vmerge.vim v9, v10, 1, v0
-; RV64-NEXT: vmul.vv v8, v11, v8
-; RV64-NEXT: vadd.vv v10, v11, v9
-; RV64-NEXT: vsub.vv v9, v11, v9
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: ret
+; FOLDING-LABEL: vwop_vscale_zext_i1i32_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu
+; FOLDING-NEXT: vlm.v v0, (a0)
+; FOLDING-NEXT: vlm.v v8, (a2)
+; FOLDING-NEXT: vlm.v v9, (a1)
+; FOLDING-NEXT: vmv.v.i v10, 0
+; FOLDING-NEXT: vmerge.vim v11, v10, 1, v0
+; FOLDING-NEXT: vmv.v.v v0, v8
+; FOLDING-NEXT: vmerge.vim v8, v10, 1, v0
+; FOLDING-NEXT: vadd.vv v10, v11, v8
+; FOLDING-NEXT: vsub.vv v8, v11, v8
+; FOLDING-NEXT: vmv.v.v v0, v9
+; FOLDING-NEXT: vor.vv v10, v10, v11, v0.t
+; FOLDING-NEXT: vor.vv v8, v10, v8
+; FOLDING-NEXT: ret
%a = load <vscale x 2 x i1>, ptr %x
%b = load <vscale x 2 x i1>, ptr %y
%b2 = load <vscale x 2 x i1>, ptr %z
@@ -496,40 +494,36 @@ define <vscale x 2 x i32> @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y,
define <vscale x 2 x i8> @vwop_vscale_zext_i1i8_multiple_users(ptr %x, ptr %y, ptr %z) {
; NO_FOLDING-LABEL: vwop_vscale_zext_i1i8_multiple_users:
; NO_FOLDING: # %bb.0:
-; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu
; NO_FOLDING-NEXT: vlm.v v0, (a0)
-; NO_FOLDING-NEXT: vlm.v v8, (a1)
-; NO_FOLDING-NEXT: vlm.v v9, (a2)
+; NO_FOLDING-NEXT: vlm.v v8, (a2)
+; NO_FOLDING-NEXT: vlm.v v9, (a1)
; NO_FOLDING-NEXT: vmv.v.i v10, 0
; NO_FOLDING-NEXT: vmerge.vim v11, v10, 1, v0
; NO_FOLDING-NEXT: vmv1r.v v0, v8
; NO_FOLDING-NEXT: vmerge.vim v8, v10, 1, v0
+; NO_FOLDING-NEXT: vadd.vv v10, v11, v8
+; NO_FOLDING-NEXT: vsub.vv v8, v11, v8
; NO_FOLDING-NEXT: vmv1r.v v0, v9
-; NO_FOLDING-NEXT: vmerge.vim v9, v10, 1, v0
-; NO_FOLDING-NEXT: vmul.vv v8, v11, v8
-; NO_FOLDING-NEXT: vadd.vv v10, v11, v9
-; NO_FOLDING-NEXT: vsub.vv v9, v11, v9
-; NO_FOLDING-NEXT: vor.vv v8, v8, v10
-; NO_FOLDING-NEXT: vor.vv v8, v8, v9
+; NO_FOLDING-NEXT: vor.vv v10, v10, v11, v0.t
+; NO_FOLDING-NEXT: vor.vv v8, v10, v8
; NO_FOLDING-NEXT: ret
;
; FOLDING-LABEL: vwop_vscale_zext_i1i8_multiple_users:
; FOLDING: # %bb.0:
-; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu
; FOLDING-NEXT: vlm.v v0, (a0)
-; FOLDING-NEXT: vlm.v v8, (a1)
-; FOLDING-NEXT: vlm.v v9, (a2)
+; FOLDING-NEXT: vlm.v v8, (a2)
+; FOLDING-NEXT: vlm.v v9, (a1)
; FOLDING-NEXT: vmv.v.i v10, 0
; FOLDING-NEXT: vmerge.vim v11, v10, 1, v0
; FOLDING-NEXT: vmv1r.v v0, v8
; FOLDING-NEXT: vmerge.vim v8, v10, 1, v0
+; FOLDING-NEXT: vadd.vv v10, v11, v8
+; FOLDING-NEXT: vsub.vv v8, v11, v8
; FOLDING-NEXT: vmv1r.v v0, v9
-; FOLDING-NEXT: vmerge.vim v9, v10, 1, v0
-; FOLDING-NEXT: vmul.vv v8, v11, v8
-; FOLDING-NEXT: vadd.vv v10, v11, v9
-; FOLDING-NEXT: vsub.vv v9, v11, v9
-; FOLDING-NEXT: vor.vv v8, v8, v10
-; FOLDING-NEXT: vor.vv v8, v8, v9
+; FOLDING-NEXT: vor.vv v10, v10, v11, v0.t
+; FOLDING-NEXT: vor.vv v8, v10, v8
; FOLDING-NEXT: ret
%a = load <vscale x 2 x i1>, ptr %x
%b = load <vscale x 2 x i1>, ptr %y
@@ -594,3 +588,6 @@ define <vscale x 2 x i32> @vwop_vscale_zext_i8i32_multiple_users(ptr %x, ptr %y,
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
index 770bb566c764..082de2e7bf77 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
@@ -627,3 +627,259 @@ define <vscale x 8 x i16> @vwsll_vi_nxv8i16(<vscale x 8 x i8> %a) {
%z = shl <vscale x 8 x i16> %x, splat (i16 2)
ret <vscale x 8 x i16> %z
}
+
+; ==============================================================================
+; i8 -> i64
+; ==============================================================================
+
+define <vscale x 2 x i64> @vwsll_vv_nxv2i64_nxv2i8_sext(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) {
+; CHECK-LABEL: vwsll_vv_nxv2i64_nxv2i8_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsext.vf8 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vv_nxv2i64_nxv2i8_sext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsext.vf8 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = sext <vscale x 2 x i8> %b to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vv_nxv2i64_nxv2i8_zext(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) {
+; CHECK-LABEL: vwsll_vv_nxv2i64_nxv2i8_zext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vzext.vf8 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vv_nxv2i64_nxv2i8_zext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vzext.vf8 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = zext <vscale x 2 x i8> %b to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i64_nxv2i64_nxv2i8(<vscale x 2 x i8> %a, i64 %b) {
+; CHECK-LABEL: vwsll_vx_i64_nxv2i64_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsll.vx v8, v10, a0
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i64_nxv2i64_nxv2i8:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i64> %head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %splat
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i32_nxv2i64_nxv2i8_sext(<vscale x 2 x i8> %a, i32 %b) {
+; CHECK-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsext.vf2 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_sext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = sext <vscale x 2 x i32> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i32_nxv2i64_nxv2i8_zext(<vscale x 2 x i8> %a, i32 %b) {
+; CHECK-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_zext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vzext.vf2 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_zext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = zext <vscale x 2 x i32> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i16_nxv2i64_nxv2i8_sext(<vscale x 2 x i8> %a, i16 %b) {
+; CHECK-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsext.vf4 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_sext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsext.vf4 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = sext <vscale x 2 x i16> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i16_nxv2i64_nxv2i8_zext(<vscale x 2 x i8> %a, i16 %b) {
+; CHECK-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_zext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vzext.vf4 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_zext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vzext.vf4 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = zext <vscale x 2 x i16> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_nxv2i8_sext(<vscale x 2 x i8> %a, i8 %b) {
+; CHECK-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsext.vf8 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_sext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsext.vf8 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = sext <vscale x 2 x i8> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_nxv2i8_zext(<vscale x 2 x i8> %a, i8 %b) {
+; CHECK-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_zext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vzext.vf8 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_zext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vzext.vf8 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = zext <vscale x 2 x i8> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vi_nxv2i64_nxv2i8(<vscale x 2 x i8> %a) {
+; CHECK-LABEL: vwsll_vi_nxv2i64_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsll.vi v8, v10, 2
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vi_nxv2i64_nxv2i8:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2
+; CHECK-ZVBB-NEXT: ret
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, splat (i64 2)
+ ret <vscale x 2 x i64> %z
+}
diff --git a/llvm/test/CodeGen/SPARC/inlineasm-bad.ll b/llvm/test/CodeGen/SPARC/inlineasm-bad.ll
index 5bf2adbeb75c..07eb67df6e5f 100644
--- a/llvm/test/CodeGen/SPARC/inlineasm-bad.ll
+++ b/llvm/test/CodeGen/SPARC/inlineasm-bad.ll
@@ -11,3 +11,12 @@ entry:
tail call void asm sideeffect "faddq $0,$1,$2", "{f38},{f0},{f0}"(fp128 0xL0, fp128 0xL0, fp128 0xL0)
ret void
}
+
+; CHECK-label:test_twinword_error
+; CHECK: error: Hi part of pair should point to an even-numbered register
+; CHECK: error: (note that in some cases it might be necessary to manually bind the input/output registers instead of relying on automatic allocation)
+
+define i64 @test_twinword_error(){
+ %1 = tail call i64 asm sideeffect "rd %asr5, ${0:L} \0A\09 srlx ${0:L}, 32, ${0:H}", "={i1}"()
+ ret i64 %1
+}
diff --git a/llvm/test/CodeGen/SPARC/inlineasm.ll b/llvm/test/CodeGen/SPARC/inlineasm.ll
index ec27598e5e83..9817d7c6971f 100644
--- a/llvm/test/CodeGen/SPARC/inlineasm.ll
+++ b/llvm/test/CodeGen/SPARC/inlineasm.ll
@@ -143,3 +143,12 @@ entry:
%1 = call double asm sideeffect "faddd $1, $2, $0", "=f,f,e"(i64 0, i64 0)
ret void
}
+
+; CHECK-label:test_twinword
+; CHECK: rd %asr5, %i1
+; CHECK: srlx %i1, 32, %i0
+
+define i64 @test_twinword(){
+ %1 = tail call i64 asm sideeffect "rd %asr5, ${0:L} \0A\09 srlx ${0:L}, 32, ${0:H}", "={i0}"()
+ ret i64 %1
+}
diff --git a/llvm/test/CodeGen/SPIRV/OpVariable_order.ll b/llvm/test/CodeGen/SPIRV/OpVariable_order.ll
index a4ca3aa709f0..6057bf38d4c4 100644
--- a/llvm/test/CodeGen/SPIRV/OpVariable_order.ll
+++ b/llvm/test/CodeGen/SPIRV/OpVariable_order.ll
@@ -1,10 +1,14 @@
-; REQUIRES: spirv-tools
-; RUN: llc -O0 -mtriple=spirv-unknown-linux %s -o - -filetype=obj | not spirv-val 2>&1 | FileCheck %s
+; All OpVariable instructions in a function must be the first instructions in the first block
-; TODO(#66261): The SPIR-V backend should reorder OpVariable instructions so this doesn't fail,
-; but in the meantime it's a good example of the spirv-val tool working as intended.
+; RUN: llc -O0 -mtriple=spirv-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-linux %s -o - -filetype=obj | spirv-val %}
-; CHECK: All OpVariable instructions in a function must be the first instructions in the first block.
+; CHECK-SPIRV: OpFunction
+; CHECK-SPIRV-NEXT: OpLabel
+; CHECK-SPIRV-NEXT: OpVariable
+; CHECK-SPIRV-NEXT: OpVariable
+; CHECK-SPIRV: OpReturn
+; CHECK-SPIRV: OpFunctionEnd
define void @main() #1 {
entry:
diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll
index 1071d3443056..b039f80860da 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll
@@ -10,22 +10,46 @@
; CHECK-SPIRV-DAG: OpName %[[FooObj:.*]] "foo_object"
; CHECK-SPIRV-DAG: OpName %[[FooMemOrder:.*]] "mem_order"
; CHECK-SPIRV-DAG: OpName %[[FooFunc:.*]] "foo"
+
; CHECK-SPIRV-DAG: %[[TyLong:.*]] = OpTypeInt 32 0
; CHECK-SPIRV-DAG: %[[TyVoid:.*]] = OpTypeVoid
+; CHECK-SPIRV-DAG: %[[TyGenPtrLong:.*]] = OpTypePointer Generic %[[TyLong]]
; CHECK-SPIRV-DAG: %[[TyPtrLong:.*]] = OpTypePointer CrossWorkgroup %[[TyLong]]
; CHECK-SPIRV-DAG: %[[TyFunPtrLong:.*]] = OpTypeFunction %[[TyVoid]] %[[TyPtrLong]]
-; CHECK-SPIRV-DAG: %[[TyGenPtrLong:.*]] = OpTypePointer Generic %[[TyLong]]
+; CHECK-SPIRV-DAG: %[[TyGenPtrPtrLong:.*]] = OpTypePointer Generic %[[TyGenPtrLong]]
; CHECK-SPIRV-DAG: %[[TyFunGenPtrLongLong:.*]] = OpTypeFunction %[[TyVoid]] %[[TyGenPtrLong]] %[[TyLong]]
+; CHECK-SPIRV-DAG: %[[TyChar:.*]] = OpTypeInt 8 0
+; CHECK-SPIRV-DAG: %[[TyGenPtrChar:.*]] = OpTypePointer Generic %[[TyChar]]
+; CHECK-SPIRV-DAG: %[[TyGenPtrPtrChar:.*]] = OpTypePointer Generic %[[TyGenPtrChar]]
+; CHECK-SPIRV-DAG: %[[TyFunPtrGenPtrChar:.*]] = OpTypePointer Function %[[TyGenPtrChar]]
; CHECK-SPIRV-DAG: %[[Const3:.*]] = OpConstant %[[TyLong]] 3
+
; CHECK-SPIRV: %[[FunTest]] = OpFunction %[[TyVoid]] None %[[TyFunPtrLong]]
; CHECK-SPIRV: %[[ArgCum]] = OpFunctionParameter %[[TyPtrLong]]
+
; CHECK-SPIRV: OpFunctionCall %[[TyVoid]] %[[FooFunc]] %[[Addr]] %[[Const3]]
+
+; CHECK-SPIRV: %[[HalfAddr:.*]] = OpPtrCastToGeneric
+; CHECK-SPIRV-NEXT: %[[HalfAddrCasted:.*]] = OpBitcast %[[TyGenPtrLong]] %[[HalfAddr]]
+; CHECK-SPIRV-NEXT: OpFunctionCall %[[TyVoid]] %[[FooFunc]] %[[HalfAddrCasted]] %[[Const3]]
+
+; CHECK-SPIRV: %[[DblAddr:.*]] = OpPtrCastToGeneric
+; CHECK-SPIRV-NEXT: %[[DblAddrCasted:.*]] = OpBitcast %[[TyGenPtrLong]] %[[DblAddr]]
+; CHECK-SPIRV-NEXT: OpFunctionCall %[[TyVoid]] %[[FooFunc]] %[[DblAddrCasted]] %[[Const3]]
+
; CHECK-SPIRV: %[[FooStub]] = OpFunction %[[TyVoid]] None %[[TyFunGenPtrLongLong]]
; CHECK-SPIRV: %[[StubObj]] = OpFunctionParameter %[[TyGenPtrLong]]
; CHECK-SPIRV: %[[MemOrder]] = OpFunctionParameter %[[TyLong]]
+
+; CHECK-SPIRV: %[[ObjectAddr:.*]] = OpVariable %[[TyFunPtrGenPtrChar]] Function
+; CHECK-SPIRV-NEXT: %[[ToGeneric:.*]] = OpPtrCastToGeneric %[[TyGenPtrPtrChar]] %[[ObjectAddr]]
+; CHECK-SPIRV-NEXT: %[[Casted:.*]] = OpBitcast %[[TyGenPtrPtrLong]] %[[ToGeneric]]
+; CHECK-SPIRV-NEXT: OpStore %[[Casted]] %[[StubObj]]
+
; CHECK-SPIRV: %[[FooFunc]] = OpFunction %[[TyVoid]] None %[[TyFunGenPtrLongLong]]
; CHECK-SPIRV: %[[FooObj]] = OpFunctionParameter %[[TyGenPtrLong]]
; CHECK-SPIRV: %[[FooMemOrder]] = OpFunctionParameter %[[TyLong]]
+
; CHECK-SPIRV: OpFunctionCall %[[TyVoid]] %[[FooStub]] %[[FooObj]] %[[FooMemOrder]]
define spir_kernel void @test(ptr addrspace(1) noundef align 4 %_arg_cum) {
diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll
new file mode 100644
index 000000000000..edb31ffeee8e
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll
@@ -0,0 +1,60 @@
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV-DAG: OpName %[[Foo:.*]] "foo"
+; CHECK-SPIRV-DAG: %[[TyChar:.*]] = OpTypeInt 8 0
+; CHECK-SPIRV-DAG: %[[TyVoid:.*]] = OpTypeVoid
+; CHECK-SPIRV-DAG: %[[TyGenPtrChar:.*]] = OpTypePointer Generic %[[TyChar]]
+; CHECK-SPIRV-DAG: %[[TyFunBar:.*]] = OpTypeFunction %[[TyVoid]] %[[TyGenPtrChar]]
+; CHECK-SPIRV-DAG: %[[TyLong:.*]] = OpTypeInt 64 0
+; CHECK-SPIRV-DAG: %[[TyGenPtrPtrChar:.*]] = OpTypePointer Generic %[[TyGenPtrChar]]
+; CHECK-SPIRV-DAG: %[[TyFunFoo:.*]] = OpTypeFunction %[[TyVoid]] %[[TyLong]] %[[TyGenPtrPtrChar]] %[[TyGenPtrPtrChar]]
+; CHECK-SPIRV-DAG: %[[TyStruct:.*]] = OpTypeStruct %[[TyLong]]
+; CHECK-SPIRV-DAG: %[[Const100:.*]] = OpConstant %[[TyLong]] 100
+; CHECK-SPIRV-DAG: %[[TyFunPtrGenPtrChar:.*]] = OpTypePointer Function %[[TyGenPtrChar]]
+; CHECK-SPIRV-DAG: %[[TyPtrStruct:.*]] = OpTypePointer Generic %[[TyStruct]]
+; CHECK-SPIRV-DAG: %[[TyPtrLong:.*]] = OpTypePointer Generic %[[TyLong]]
+
+; CHECK-SPIRV: %[[Bar:.*]] = OpFunction %[[TyVoid]] None %[[TyFunBar]]
+; CHECK-SPIRV: %[[BarArg:.*]] = OpFunctionParameter %[[TyGenPtrChar]]
+; CHECK-SPIRV-NEXT: OpLabel
+; CHECK-SPIRV-NEXT: OpVariable %[[TyFunPtrGenPtrChar]] Function
+; CHECK-SPIRV-NEXT: OpVariable %[[TyFunPtrGenPtrChar]] Function
+; CHECK-SPIRV-NEXT: OpVariable %[[TyFunPtrGenPtrChar]] Function
+; CHECK-SPIRV: %[[Var1:.*]] = OpPtrCastToGeneric %[[TyGenPtrPtrChar]] %[[#]]
+; CHECK-SPIRV: %[[Var2:.*]] = OpPtrCastToGeneric %[[TyGenPtrPtrChar]] %[[#]]
+; CHECK-SPIRV: OpStore %[[#]] %[[BarArg]]
+; CHECK-SPIRV-NEXT: OpFunctionCall %[[TyVoid]] %[[Foo]] %[[Const100]] %[[Var1]] %[[Var2]]
+; CHECK-SPIRV-NEXT: OpFunctionCall %[[TyVoid]] %[[Foo]] %[[Const100]] %[[Var2]] %[[Var1]]
+
+; CHECK-SPIRV: %[[Foo]] = OpFunction %[[TyVoid]] None %[[TyFunFoo]]
+; CHECK-SPIRV-NEXT: OpFunctionParameter %[[TyLong]]
+; CHECK-SPIRV-NEXT: OpFunctionParameter %[[TyGenPtrPtrChar]]
+; CHECK-SPIRV-NEXT: OpFunctionParameter %[[TyGenPtrPtrChar]]
+
+%class.CustomType = type { i64 }
+
+define linkonce_odr dso_local spir_func void @bar(ptr addrspace(4) noundef %first) {
+entry:
+ %first.addr = alloca ptr addrspace(4)
+ %first.addr.ascast = addrspacecast ptr %first.addr to ptr addrspace(4)
+ %temp = alloca ptr addrspace(4), align 8
+ %temp.ascast = addrspacecast ptr %temp to ptr addrspace(4)
+ store ptr addrspace(4) %first, ptr %first.addr
+ call spir_func void @foo(i64 noundef 100, ptr addrspace(4) noundef dereferenceable(8) %first.addr.ascast, ptr addrspace(4) noundef dereferenceable(8) %temp.ascast)
+ call spir_func void @foo(i64 noundef 100, ptr addrspace(4) noundef dereferenceable(8) %temp.ascast, ptr addrspace(4) noundef dereferenceable(8) %first.addr.ascast)
+ %var = alloca ptr addrspace(4), align 8
+ ret void
+}
+
+define linkonce_odr dso_local spir_func void @foo(i64 noundef %offset, ptr addrspace(4) noundef dereferenceable(8) %in_acc1, ptr addrspace(4) noundef dereferenceable(8) %out_acc1) {
+entry:
+ %r0 = load ptr addrspace(4), ptr addrspace(4) %in_acc1
+ %arrayidx = getelementptr inbounds %class.CustomType, ptr addrspace(4) %r0, i64 42
+ %r1 = load i64, ptr addrspace(4) %arrayidx
+ %r3 = load ptr addrspace(4), ptr addrspace(4) %out_acc1
+ %r4 = getelementptr %class.CustomType, ptr addrspace(4) %r3, i64 43
+ store i64 %r1, ptr addrspace(4) %r4
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/WebAssembly/multi-return.ll b/llvm/test/CodeGen/WebAssembly/multi-return.ll
index 3429cd512a46..293a1b35c39c 100644
--- a/llvm/test/CodeGen/WebAssembly/multi-return.ll
+++ b/llvm/test/CodeGen/WebAssembly/multi-return.ll
@@ -78,18 +78,16 @@ define i64 @test4() {
define { i64, i128 } @test5() {
; CHECK-LABEL: test5:
; CHECK: call return_multi_multi
-; CHECK: i32.const $push8=, 8
-; CHECK: i32.add $push9=, $[[SP:[0-9]+]], $pop8
-; CHECK: i32.const $push0=, 16
-; CHECK: i32.add $push1=, $pop9, $pop0
+; CHECK: i32.const $push0=, 24
+; CHECK: i32.add $push1=, $[[SP:[0-9]+]], $pop0
; CHECK: i64.load $[[L1:[0-9]+]]=, 0($pop1)
; CHECK: i64.load $[[L2:[0-9]+]]=, 8($[[SP]])
; CHECK: i64.load $push2=, 16($[[SP]])
; CHECK: i64.store 8($0), $pop2
+; CHECK: i64.store 16($0), $[[L1]]
; CHECK: i64.store 0($0), $[[L2]]
-; CHECK: i32.const $push12=, 16
-; CHECK: i32.add $push3=, $0, $pop12
-; CHECK: i64.store 0($pop3), $[[L1]]
+; CHECK: i32.const $push5=, 80
+; CHECK: i32.add $push6=, $3, $pop5
%t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
%r0 = extractvalue { i64, i128, i192, i128, i64 } %t0, 0
%r1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 1
@@ -101,20 +99,20 @@ define { i64, i128 } @test5() {
define { i128, i128 } @test6() {
; CHECK-LABEL: test6:
; CHECK: call return_multi_multi
-; CHECK: i32.const $push0=, 64
+; CHECK: i32.const $push0=, 24
; CHECK: i32.add $push1=, $[[SP:[0-9]+]], $pop0
; CHECK: i64.load $[[L1:[0-9]+]]=, 0($pop1)
-; CHECK: i32.const $push2=, 24
+; CHECK: i32.const $push2=, 64
; CHECK: i32.add $push3=, $[[SP]], $pop2
; CHECK: i64.load $[[L2:[0-9]+]]=, 0($pop3)
; CHECK: i64.load $[[L3:[0-9]+]]=, 16($[[SP]])
; CHECK: i64.load $push4=, 56($[[SP]])
; CHECK: i64.store 16($0), $pop4
+; CHECK: i64.store 24($0), $[[L2]]
; CHECK: i64.store 0($0), $[[L3]]
-; CHECK: i64.store 8($0), $[[L2]]
-; CHECK: i32.const $push5=, 24
-; CHECK: i32.add $push6=, $0, $pop5
-; CHECK: i64.store 0($pop6), $[[L1]]
+; CHECK: i64.store 8($0), $[[L1]]
+; CHECK: i32.const $push7=, 80
+; CHECK: i32.add $push8=, $4, $pop7
%t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
%r1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 1
%r3 = extractvalue { i64, i128, i192, i128, i64 } %t0, 3
@@ -129,19 +127,17 @@ define { i64, i192 } @test7() {
; CHECK: i32.const $push0=, 40
; CHECK: i32.add $push1=, $[[SP:[0-9]+]], $pop0
; CHECK: i64.load $[[L1:[0-9]+]]=, 0($pop1)
+; CHECK: i64.load $[[L2:[0-9]+]]=, 8($[[SP]])
+; CHECK: i64.load $[[L3:[0-9]+]]=, 32($[[SP]])
; CHECK: i32.const $push2=, 48
; CHECK: i32.add $push3=, $[[SP]], $pop2
-; CHECK: i64.load $[[L2:[0-9]+]]=, 0($pop3)
-; CHECK: i64.load $[[L3:[0-9]+]]=, 8($[[SP]])
-; CHECK: i64.load $push4=, 32($[[SP]])
-; CHECK: i64.store 8($0), $pop4
-; CHECK: i64.store 0($0), $[[L3]]
-; CHECK: i32.const $push5=, 24
-; CHECK: i32.add $push6=, $0, $pop5
-; CHECK: i64.store 0($pop6), $[[L2]]
-; CHECK: i32.const $push7=, 16
-; CHECK: i32.add $push8=, $0, $pop7
-; CHECK: i64.store 0($pop8), $[[L1]]
+; CHECK: i64.load $push4=, 0($pop3)
+; CHECK: i64.store 24($0), $pop4
+; CHECK: i64.store 8($0), $[[L3]]
+; CHECK: i64.store 16($0), $[[L1]]
+; CHECK: i64.store 0($0), $[[L2]]
+; CHECK: i32.const $push7=, 80
+; CHECK: i32.add $push8=, $4, $pop7
%t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
%r0 = extractvalue { i64, i128, i192, i128, i64 } %t0, 0
%r2 = extractvalue { i64, i128, i192, i128, i64 } %t0, 2
@@ -153,18 +149,16 @@ define { i64, i192 } @test7() {
define { i128, i192, i128, i64 } @test8() {
; CHECK-LABEL: test8:
; CHECK: call return_multi_multi
-; CHECK: i32.const $push18=, 8
-; CHECK: i32.add $push19=, $[[SP:[0-9]+]], $pop18
-; CHECK: i32.const $push0=, 32
-; CHECK: i32.add $push1=, $pop19, $pop0
+; CHECK: i32.const $push0=, 64
+; CHECK: i32.add $push1=, $[[SP:[0-9]+]], $pop0
; CHECK: i64.load $[[L1:[0-9]+]]=, 0($pop1)
-; CHECK: i32.const $push2=, 48
+; CHECK: i32.const $push2=, 40
; CHECK: i32.add $push3=, $[[SP]], $pop2
; CHECK: i64.load $[[L2:[0-9]+]]=, 0($pop3)
-; CHECK: i32.const $push4=, 24
+; CHECK: i32.const $push4=, 48
; CHECK: i32.add $push5=, $[[SP]], $pop4
; CHECK: i64.load $[[L3:[0-9]+]]=, 0($pop5)
-; CHECK: i32.const $push6=, 64
+; CHECK: i32.const $push6=, 24
; CHECK: i32.add $push7=, $[[SP]], $pop6
; CHECK: i64.load $[[L4:[0-9]+]]=, 0($pop7)
; CHECK: i64.load $[[L5:[0-9]+]]=, 8($[[SP]])
@@ -172,19 +166,15 @@ define { i128, i192, i128, i64 } @test8() {
; CHECK: i64.load $[[L7:[0-9]+]]=, 32($[[SP]])
; CHECK: i64.load $push8=, 16($[[SP]])
; CHECK: i64.store 40($0), $pop8
+; CHECK: i64.store 48($0), $[[L4]]
+; CHECK: i64.store 32($0), $[[L3]]
; CHECK: i64.store 16($0), $[[L7]]
+; CHECK: i64.store 24($0), $[[L2]]
; CHECK: i64.store 0($0), $[[L6]]
-; CHECK: i64.store 8($0), $[[L4]]
+; CHECK: i64.store 8($0), $[[L1]]
; CHECK: i64.store 56($0), $[[L5]]
-; CHECK: i32.const $push9=, 48
-; CHECK: i32.add $push10=, $0, $pop9
-; CHECK: i64.store 0($pop10), $[[L3]]
-; CHECK: i32.const $push22=, 32
-; CHECK: i32.add $push11=, $0, $pop22
-; CHECK: i64.store 0($pop11), $[[L2]]
-; CHECK: i32.const $push12=, 24
-; CHECK: i32.add $push13=, $0, $pop12
-; CHECK: i64.store 0($pop13), $[[L1]]
+; CHECK: i32.const $push11=, 80
+; CHECK: i32.add $push12=, $8, $pop11
%t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
%r0 = extractvalue { i64, i128, i192, i128, i64 } %t0, 0
%r1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 1
diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
index 3a806b955f85..761a75418a00 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
@@ -31,60 +31,38 @@ define <16 x i8> @add_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: add_v16i8:
; NO-SIMD128: .functype add_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.add $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.add $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.add $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.add $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.add $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.add $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.add $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.add $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.add $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.add $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.add $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.add $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.add $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.add $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.add $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.add $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.add $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.add $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.add $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.add $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.add $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.add $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.add $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.add $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.add $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.add $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.add $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.add $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.add $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.add $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.add $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: add_v16i8:
@@ -96,54 +74,32 @@ define <16 x i8> @add_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.add $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.add $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.add $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.add $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.add $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.add $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.add $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.add $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.add $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.add $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.add $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.add $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.add $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = add <16 x i8> %x, %y
ret <16 x i8> %a
@@ -165,60 +121,38 @@ define <16 x i8> @sub_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: sub_v16i8:
; NO-SIMD128: .functype sub_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.sub $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.sub $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.sub $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.sub $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.sub $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.sub $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.sub $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.sub $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.sub $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.sub $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.sub $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.sub $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.sub $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.sub $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.sub $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.sub $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.sub $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.sub $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.sub $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.sub $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.sub $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.sub $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.sub $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.sub $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.sub $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.sub $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.sub $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.sub $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.sub $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.sub $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.sub $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.sub $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: sub_v16i8:
@@ -230,54 +164,32 @@ define <16 x i8> @sub_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.sub $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.sub $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.sub $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.sub $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.sub $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.sub $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.sub $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.sub $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.sub $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.sub $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.sub $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.sub $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = sub <16 x i8> %x, %y
ret <16 x i8> %a
@@ -425,60 +337,38 @@ define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: mul_v16i8:
; NO-SIMD128: .functype mul_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.mul $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.mul $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.mul $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.mul $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.mul $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.mul $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.mul $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.mul $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.mul $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.mul $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.mul $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.mul $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.mul $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.mul $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.mul $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.mul $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.mul $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.mul $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.mul $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.mul $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.mul $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.mul $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.mul $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.mul $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.mul $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.mul $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.mul $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.mul $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.mul $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.mul $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.mul $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.mul $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: mul_v16i8:
@@ -490,54 +380,32 @@ define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.mul $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.mul $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.mul $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.mul $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.mul $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.mul $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.mul $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.mul $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.mul $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.mul $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.mul $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.mul $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.mul $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = mul <16 x i8> %x, %y
ret <16 x i8> %a
@@ -559,108 +427,86 @@ define <16 x i8> @min_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: min_s_v16i8:
; NO-SIMD128: .functype min_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 15
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16
; NO-SIMD128-NEXT: i32.extend8_s $push0=, $32
; NO-SIMD128-NEXT: i32.lt_s $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $16, $32, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.extend8_s $push7=, $15
-; NO-SIMD128-NEXT: i32.extend8_s $push6=, $31
-; NO-SIMD128-NEXT: i32.lt_s $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $15, $31, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push16=, 13
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.extend8_s $push13=, $14
-; NO-SIMD128-NEXT: i32.extend8_s $push12=, $30
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop3
+; NO-SIMD128-NEXT: i32.extend8_s $push5=, $15
+; NO-SIMD128-NEXT: i32.extend8_s $push4=, $31
+; NO-SIMD128-NEXT: i32.lt_s $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $15, $31, $pop6
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop7
+; NO-SIMD128-NEXT: i32.extend8_s $push9=, $14
+; NO-SIMD128-NEXT: i32.extend8_s $push8=, $30
+; NO-SIMD128-NEXT: i32.lt_s $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $14, $30, $pop10
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop11
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $13
+; NO-SIMD128-NEXT: i32.extend8_s $push12=, $29
; NO-SIMD128-NEXT: i32.lt_s $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.select $push15=, $14, $30, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push22=, 12
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.extend8_s $push19=, $13
-; NO-SIMD128-NEXT: i32.extend8_s $push18=, $29
-; NO-SIMD128-NEXT: i32.lt_s $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.select $push21=, $13, $29, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push28=, 11
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.extend8_s $push25=, $12
-; NO-SIMD128-NEXT: i32.extend8_s $push24=, $28
+; NO-SIMD128-NEXT: i32.select $push15=, $13, $29, $pop14
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop15
+; NO-SIMD128-NEXT: i32.extend8_s $push17=, $12
+; NO-SIMD128-NEXT: i32.extend8_s $push16=, $28
+; NO-SIMD128-NEXT: i32.lt_s $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.select $push19=, $12, $28, $pop18
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop19
+; NO-SIMD128-NEXT: i32.extend8_s $push21=, $11
+; NO-SIMD128-NEXT: i32.extend8_s $push20=, $27
+; NO-SIMD128-NEXT: i32.lt_s $push22=, $pop21, $pop20
+; NO-SIMD128-NEXT: i32.select $push23=, $11, $27, $pop22
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop23
+; NO-SIMD128-NEXT: i32.extend8_s $push25=, $10
+; NO-SIMD128-NEXT: i32.extend8_s $push24=, $26
; NO-SIMD128-NEXT: i32.lt_s $push26=, $pop25, $pop24
-; NO-SIMD128-NEXT: i32.select $push27=, $12, $28, $pop26
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push34=, 10
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.extend8_s $push31=, $11
-; NO-SIMD128-NEXT: i32.extend8_s $push30=, $27
-; NO-SIMD128-NEXT: i32.lt_s $push32=, $pop31, $pop30
-; NO-SIMD128-NEXT: i32.select $push33=, $11, $27, $pop32
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push40=, 9
-; NO-SIMD128-NEXT: i32.add $push41=, $0, $pop40
-; NO-SIMD128-NEXT: i32.extend8_s $push37=, $10
-; NO-SIMD128-NEXT: i32.extend8_s $push36=, $26
+; NO-SIMD128-NEXT: i32.select $push27=, $10, $26, $pop26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop27
+; NO-SIMD128-NEXT: i32.extend8_s $push29=, $9
+; NO-SIMD128-NEXT: i32.extend8_s $push28=, $25
+; NO-SIMD128-NEXT: i32.lt_s $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT: i32.select $push31=, $9, $25, $pop30
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop31
+; NO-SIMD128-NEXT: i32.extend8_s $push33=, $8
+; NO-SIMD128-NEXT: i32.extend8_s $push32=, $24
+; NO-SIMD128-NEXT: i32.lt_s $push34=, $pop33, $pop32
+; NO-SIMD128-NEXT: i32.select $push35=, $8, $24, $pop34
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop35
+; NO-SIMD128-NEXT: i32.extend8_s $push37=, $7
+; NO-SIMD128-NEXT: i32.extend8_s $push36=, $23
; NO-SIMD128-NEXT: i32.lt_s $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.select $push39=, $10, $26, $pop38
-; NO-SIMD128-NEXT: i32.store8 0($pop41), $pop39
-; NO-SIMD128-NEXT: i32.extend8_s $push43=, $9
-; NO-SIMD128-NEXT: i32.extend8_s $push42=, $25
-; NO-SIMD128-NEXT: i32.lt_s $push44=, $pop43, $pop42
-; NO-SIMD128-NEXT: i32.select $push45=, $9, $25, $pop44
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop45
-; NO-SIMD128-NEXT: i32.const $push50=, 7
-; NO-SIMD128-NEXT: i32.add $push51=, $0, $pop50
-; NO-SIMD128-NEXT: i32.extend8_s $push47=, $8
-; NO-SIMD128-NEXT: i32.extend8_s $push46=, $24
-; NO-SIMD128-NEXT: i32.lt_s $push48=, $pop47, $pop46
-; NO-SIMD128-NEXT: i32.select $push49=, $8, $24, $pop48
-; NO-SIMD128-NEXT: i32.store8 0($pop51), $pop49
-; NO-SIMD128-NEXT: i32.const $push56=, 6
-; NO-SIMD128-NEXT: i32.add $push57=, $0, $pop56
-; NO-SIMD128-NEXT: i32.extend8_s $push53=, $7
-; NO-SIMD128-NEXT: i32.extend8_s $push52=, $23
+; NO-SIMD128-NEXT: i32.select $push39=, $7, $23, $pop38
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop39
+; NO-SIMD128-NEXT: i32.extend8_s $push41=, $6
+; NO-SIMD128-NEXT: i32.extend8_s $push40=, $22
+; NO-SIMD128-NEXT: i32.lt_s $push42=, $pop41, $pop40
+; NO-SIMD128-NEXT: i32.select $push43=, $6, $22, $pop42
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop43
+; NO-SIMD128-NEXT: i32.extend8_s $push45=, $5
+; NO-SIMD128-NEXT: i32.extend8_s $push44=, $21
+; NO-SIMD128-NEXT: i32.lt_s $push46=, $pop45, $pop44
+; NO-SIMD128-NEXT: i32.select $push47=, $5, $21, $pop46
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop47
+; NO-SIMD128-NEXT: i32.extend8_s $push49=, $4
+; NO-SIMD128-NEXT: i32.extend8_s $push48=, $20
+; NO-SIMD128-NEXT: i32.lt_s $push50=, $pop49, $pop48
+; NO-SIMD128-NEXT: i32.select $push51=, $4, $20, $pop50
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop51
+; NO-SIMD128-NEXT: i32.extend8_s $push53=, $3
+; NO-SIMD128-NEXT: i32.extend8_s $push52=, $19
; NO-SIMD128-NEXT: i32.lt_s $push54=, $pop53, $pop52
-; NO-SIMD128-NEXT: i32.select $push55=, $7, $23, $pop54
-; NO-SIMD128-NEXT: i32.store8 0($pop57), $pop55
-; NO-SIMD128-NEXT: i32.const $push62=, 5
-; NO-SIMD128-NEXT: i32.add $push63=, $0, $pop62
-; NO-SIMD128-NEXT: i32.extend8_s $push59=, $6
-; NO-SIMD128-NEXT: i32.extend8_s $push58=, $22
-; NO-SIMD128-NEXT: i32.lt_s $push60=, $pop59, $pop58
-; NO-SIMD128-NEXT: i32.select $push61=, $6, $22, $pop60
-; NO-SIMD128-NEXT: i32.store8 0($pop63), $pop61
-; NO-SIMD128-NEXT: i32.extend8_s $push65=, $5
-; NO-SIMD128-NEXT: i32.extend8_s $push64=, $21
-; NO-SIMD128-NEXT: i32.lt_s $push66=, $pop65, $pop64
-; NO-SIMD128-NEXT: i32.select $push67=, $5, $21, $pop66
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop67
-; NO-SIMD128-NEXT: i32.const $push72=, 3
-; NO-SIMD128-NEXT: i32.add $push73=, $0, $pop72
-; NO-SIMD128-NEXT: i32.extend8_s $push69=, $4
-; NO-SIMD128-NEXT: i32.extend8_s $push68=, $20
-; NO-SIMD128-NEXT: i32.lt_s $push70=, $pop69, $pop68
-; NO-SIMD128-NEXT: i32.select $push71=, $4, $20, $pop70
-; NO-SIMD128-NEXT: i32.store8 0($pop73), $pop71
-; NO-SIMD128-NEXT: i32.extend8_s $push75=, $3
-; NO-SIMD128-NEXT: i32.extend8_s $push74=, $19
-; NO-SIMD128-NEXT: i32.lt_s $push76=, $pop75, $pop74
-; NO-SIMD128-NEXT: i32.select $push77=, $3, $19, $pop76
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop77
-; NO-SIMD128-NEXT: i32.extend8_s $push79=, $2
-; NO-SIMD128-NEXT: i32.extend8_s $push78=, $18
-; NO-SIMD128-NEXT: i32.lt_s $push80=, $pop79, $pop78
-; NO-SIMD128-NEXT: i32.select $push81=, $2, $18, $pop80
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop81
-; NO-SIMD128-NEXT: i32.extend8_s $push83=, $1
-; NO-SIMD128-NEXT: i32.extend8_s $push82=, $17
-; NO-SIMD128-NEXT: i32.lt_s $push84=, $pop83, $pop82
-; NO-SIMD128-NEXT: i32.select $push85=, $1, $17, $pop84
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop85
+; NO-SIMD128-NEXT: i32.select $push55=, $3, $19, $pop54
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop55
+; NO-SIMD128-NEXT: i32.extend8_s $push57=, $2
+; NO-SIMD128-NEXT: i32.extend8_s $push56=, $18
+; NO-SIMD128-NEXT: i32.lt_s $push58=, $pop57, $pop56
+; NO-SIMD128-NEXT: i32.select $push59=, $2, $18, $pop58
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop59
+; NO-SIMD128-NEXT: i32.extend8_s $push61=, $1
+; NO-SIMD128-NEXT: i32.extend8_s $push60=, $17
+; NO-SIMD128-NEXT: i32.lt_s $push62=, $pop61, $pop60
+; NO-SIMD128-NEXT: i32.select $push63=, $1, $17, $pop62
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop63
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_s_v16i8:
@@ -681,93 +527,71 @@ define <16 x i8> @min_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.lt_s $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $3, $19, $pop10
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $4
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $20
; NO-SIMD128-FAST-NEXT: i32.lt_s $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $4, $20, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $5
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $21
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.select $push21=, $5, $21, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $6
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $22
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push24=, $pop23, $pop22
-; NO-SIMD128-FAST-NEXT: i32.select $push25=, $6, $22, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $7
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push17=, $5
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $21
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.select $push19=, $5, $21, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop19
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $6
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $22
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push22=, $pop21, $pop20
+; NO-SIMD128-FAST-NEXT: i32.select $push23=, $6, $22, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop23
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $7
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push24=, $23
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push26=, $pop25, $pop24
+; NO-SIMD128-FAST-NEXT: i32.select $push27=, $7, $23, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $24
; NO-SIMD128-FAST-NEXT: i32.lt_s $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.select $push31=, $7, $23, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push35=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push34=, $24
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push36=, $pop35, $pop34
-; NO-SIMD128-FAST-NEXT: i32.select $push37=, $8, $24, $pop36
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop39), $pop37
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $9
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $25
+; NO-SIMD128-FAST-NEXT: i32.select $push31=, $8, $24, $pop30
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop31
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push33=, $9
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push32=, $25
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push34=, $pop33, $pop32
+; NO-SIMD128-FAST-NEXT: i32.select $push35=, $9, $25, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop35
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push37=, $10
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push36=, $26
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push38=, $pop37, $pop36
+; NO-SIMD128-FAST-NEXT: i32.select $push39=, $10, $26, $pop38
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop39
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $11
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $27
; NO-SIMD128-FAST-NEXT: i32.lt_s $push42=, $pop41, $pop40
-; NO-SIMD128-FAST-NEXT: i32.select $push43=, $9, $25, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop43
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push49=, $0, $pop48
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $10
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $26
+; NO-SIMD128-FAST-NEXT: i32.select $push43=, $11, $27, $pop42
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop43
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $28
; NO-SIMD128-FAST-NEXT: i32.lt_s $push46=, $pop45, $pop44
-; NO-SIMD128-FAST-NEXT: i32.select $push47=, $10, $26, $pop46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop49), $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push55=, $0, $pop54
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push51=, $11
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push50=, $27
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push52=, $pop51, $pop50
-; NO-SIMD128-FAST-NEXT: i32.select $push53=, $11, $27, $pop52
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop55), $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push61=, $0, $pop60
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push57=, $12
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push56=, $28
+; NO-SIMD128-FAST-NEXT: i32.select $push47=, $12, $28, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop47
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push49=, $13
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push48=, $29
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push50=, $pop49, $pop48
+; NO-SIMD128-FAST-NEXT: i32.select $push51=, $13, $29, $pop50
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop51
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push53=, $14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push52=, $30
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push54=, $pop53, $pop52
+; NO-SIMD128-FAST-NEXT: i32.select $push55=, $14, $30, $pop54
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop55
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push57=, $15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push56=, $31
; NO-SIMD128-FAST-NEXT: i32.lt_s $push58=, $pop57, $pop56
-; NO-SIMD128-FAST-NEXT: i32.select $push59=, $12, $28, $pop58
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop61), $pop59
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push67=, $0, $pop66
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push63=, $13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push62=, $29
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push64=, $pop63, $pop62
-; NO-SIMD128-FAST-NEXT: i32.select $push65=, $13, $29, $pop64
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop67), $pop65
-; NO-SIMD128-FAST-NEXT: i32.const $push72=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push73=, $0, $pop72
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push69=, $14
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push68=, $30
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push70=, $pop69, $pop68
-; NO-SIMD128-FAST-NEXT: i32.select $push71=, $14, $30, $pop70
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop73), $pop71
-; NO-SIMD128-FAST-NEXT: i32.const $push78=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push79=, $0, $pop78
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push75=, $15
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push74=, $31
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push76=, $pop75, $pop74
-; NO-SIMD128-FAST-NEXT: i32.select $push77=, $15, $31, $pop76
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop79), $pop77
-; NO-SIMD128-FAST-NEXT: i32.const $push84=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push85=, $0, $pop84
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push81=, $16
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push80=, $32
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push82=, $pop81, $pop80
-; NO-SIMD128-FAST-NEXT: i32.select $push83=, $16, $32, $pop82
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop85), $pop83
+; NO-SIMD128-FAST-NEXT: i32.select $push59=, $15, $31, $pop58
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop59
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push61=, $16
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push60=, $32
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push62=, $pop61, $pop60
+; NO-SIMD128-FAST-NEXT: i32.select $push63=, $16, $32, $pop62
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop63
; NO-SIMD128-FAST-NEXT: return
%c = icmp slt <16 x i8> %x, %y
%a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
@@ -790,140 +614,118 @@ define <16 x i8> @min_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: min_u_v16i8:
; NO-SIMD128: .functype min_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 15
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.const $push0=, 255
; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0
-; NO-SIMD128-NEXT: i32.const $push117=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop117
+; NO-SIMD128-NEXT: i32.const $push95=, 255
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop95
; NO-SIMD128-NEXT: i32.lt_u $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.select $push4=, $16, $32, $pop3
-; NO-SIMD128-NEXT: i32.store8 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push116=, 255
-; NO-SIMD128-NEXT: i32.and $push8=, $15, $pop116
-; NO-SIMD128-NEXT: i32.const $push115=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $31, $pop115
-; NO-SIMD128-NEXT: i32.lt_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.select $push10=, $15, $31, $pop9
-; NO-SIMD128-NEXT: i32.store8 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 13
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push114=, 255
-; NO-SIMD128-NEXT: i32.and $push14=, $14, $pop114
-; NO-SIMD128-NEXT: i32.const $push113=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $30, $pop113
-; NO-SIMD128-NEXT: i32.lt_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.select $push16=, $14, $30, $pop15
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push23=, 12
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.const $push112=, 255
-; NO-SIMD128-NEXT: i32.and $push20=, $13, $pop112
-; NO-SIMD128-NEXT: i32.const $push111=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $29, $pop111
-; NO-SIMD128-NEXT: i32.lt_u $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.select $push22=, $13, $29, $pop21
-; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push110=, 255
-; NO-SIMD128-NEXT: i32.and $push26=, $12, $pop110
-; NO-SIMD128-NEXT: i32.const $push109=, 255
-; NO-SIMD128-NEXT: i32.and $push25=, $28, $pop109
-; NO-SIMD128-NEXT: i32.lt_u $push27=, $pop26, $pop25
-; NO-SIMD128-NEXT: i32.select $push28=, $12, $28, $pop27
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push35=, 10
-; NO-SIMD128-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-NEXT: i32.const $push108=, 255
-; NO-SIMD128-NEXT: i32.and $push32=, $11, $pop108
-; NO-SIMD128-NEXT: i32.const $push107=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $27, $pop107
-; NO-SIMD128-NEXT: i32.lt_u $push33=, $pop32, $pop31
-; NO-SIMD128-NEXT: i32.select $push34=, $11, $27, $pop33
-; NO-SIMD128-NEXT: i32.store8 0($pop36), $pop34
-; NO-SIMD128-NEXT: i32.const $push41=, 9
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.const $push106=, 255
-; NO-SIMD128-NEXT: i32.and $push38=, $10, $pop106
-; NO-SIMD128-NEXT: i32.const $push105=, 255
-; NO-SIMD128-NEXT: i32.and $push37=, $26, $pop105
-; NO-SIMD128-NEXT: i32.lt_u $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.select $push40=, $10, $26, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push104=, 255
-; NO-SIMD128-NEXT: i32.and $push44=, $9, $pop104
-; NO-SIMD128-NEXT: i32.const $push103=, 255
-; NO-SIMD128-NEXT: i32.and $push43=, $25, $pop103
-; NO-SIMD128-NEXT: i32.lt_u $push45=, $pop44, $pop43
-; NO-SIMD128-NEXT: i32.select $push46=, $9, $25, $pop45
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop46
-; NO-SIMD128-NEXT: i32.const $push51=, 7
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.const $push102=, 255
-; NO-SIMD128-NEXT: i32.and $push48=, $8, $pop102
-; NO-SIMD128-NEXT: i32.const $push101=, 255
-; NO-SIMD128-NEXT: i32.and $push47=, $24, $pop101
-; NO-SIMD128-NEXT: i32.lt_u $push49=, $pop48, $pop47
-; NO-SIMD128-NEXT: i32.select $push50=, $8, $24, $pop49
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.const $push57=, 6
-; NO-SIMD128-NEXT: i32.add $push58=, $0, $pop57
-; NO-SIMD128-NEXT: i32.const $push100=, 255
-; NO-SIMD128-NEXT: i32.and $push54=, $7, $pop100
-; NO-SIMD128-NEXT: i32.const $push99=, 255
-; NO-SIMD128-NEXT: i32.and $push53=, $23, $pop99
-; NO-SIMD128-NEXT: i32.lt_u $push55=, $pop54, $pop53
-; NO-SIMD128-NEXT: i32.select $push56=, $7, $23, $pop55
-; NO-SIMD128-NEXT: i32.store8 0($pop58), $pop56
-; NO-SIMD128-NEXT: i32.const $push63=, 5
-; NO-SIMD128-NEXT: i32.add $push64=, $0, $pop63
-; NO-SIMD128-NEXT: i32.const $push98=, 255
-; NO-SIMD128-NEXT: i32.and $push60=, $6, $pop98
-; NO-SIMD128-NEXT: i32.const $push97=, 255
-; NO-SIMD128-NEXT: i32.and $push59=, $22, $pop97
-; NO-SIMD128-NEXT: i32.lt_u $push61=, $pop60, $pop59
-; NO-SIMD128-NEXT: i32.select $push62=, $6, $22, $pop61
-; NO-SIMD128-NEXT: i32.store8 0($pop64), $pop62
-; NO-SIMD128-NEXT: i32.const $push96=, 255
-; NO-SIMD128-NEXT: i32.and $push66=, $5, $pop96
-; NO-SIMD128-NEXT: i32.const $push95=, 255
-; NO-SIMD128-NEXT: i32.and $push65=, $21, $pop95
-; NO-SIMD128-NEXT: i32.lt_u $push67=, $pop66, $pop65
-; NO-SIMD128-NEXT: i32.select $push68=, $5, $21, $pop67
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop68
-; NO-SIMD128-NEXT: i32.const $push73=, 3
-; NO-SIMD128-NEXT: i32.add $push74=, $0, $pop73
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop4
; NO-SIMD128-NEXT: i32.const $push94=, 255
-; NO-SIMD128-NEXT: i32.and $push70=, $4, $pop94
+; NO-SIMD128-NEXT: i32.and $push6=, $15, $pop94
; NO-SIMD128-NEXT: i32.const $push93=, 255
-; NO-SIMD128-NEXT: i32.and $push69=, $20, $pop93
-; NO-SIMD128-NEXT: i32.lt_u $push71=, $pop70, $pop69
-; NO-SIMD128-NEXT: i32.select $push72=, $4, $20, $pop71
-; NO-SIMD128-NEXT: i32.store8 0($pop74), $pop72
+; NO-SIMD128-NEXT: i32.and $push5=, $31, $pop93
+; NO-SIMD128-NEXT: i32.lt_u $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.select $push8=, $15, $31, $pop7
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop8
; NO-SIMD128-NEXT: i32.const $push92=, 255
-; NO-SIMD128-NEXT: i32.and $push76=, $3, $pop92
+; NO-SIMD128-NEXT: i32.and $push10=, $14, $pop92
; NO-SIMD128-NEXT: i32.const $push91=, 255
-; NO-SIMD128-NEXT: i32.and $push75=, $19, $pop91
-; NO-SIMD128-NEXT: i32.lt_u $push77=, $pop76, $pop75
-; NO-SIMD128-NEXT: i32.select $push78=, $3, $19, $pop77
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop78
+; NO-SIMD128-NEXT: i32.and $push9=, $30, $pop91
+; NO-SIMD128-NEXT: i32.lt_u $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.select $push12=, $14, $30, $pop11
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop12
; NO-SIMD128-NEXT: i32.const $push90=, 255
-; NO-SIMD128-NEXT: i32.and $push80=, $2, $pop90
+; NO-SIMD128-NEXT: i32.and $push14=, $13, $pop90
; NO-SIMD128-NEXT: i32.const $push89=, 255
-; NO-SIMD128-NEXT: i32.and $push79=, $18, $pop89
-; NO-SIMD128-NEXT: i32.lt_u $push81=, $pop80, $pop79
-; NO-SIMD128-NEXT: i32.select $push82=, $2, $18, $pop81
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop82
+; NO-SIMD128-NEXT: i32.and $push13=, $29, $pop89
+; NO-SIMD128-NEXT: i32.lt_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.select $push16=, $13, $29, $pop15
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop16
; NO-SIMD128-NEXT: i32.const $push88=, 255
-; NO-SIMD128-NEXT: i32.and $push84=, $1, $pop88
+; NO-SIMD128-NEXT: i32.and $push18=, $12, $pop88
; NO-SIMD128-NEXT: i32.const $push87=, 255
-; NO-SIMD128-NEXT: i32.and $push83=, $17, $pop87
-; NO-SIMD128-NEXT: i32.lt_u $push85=, $pop84, $pop83
-; NO-SIMD128-NEXT: i32.select $push86=, $1, $17, $pop85
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop86
+; NO-SIMD128-NEXT: i32.and $push17=, $28, $pop87
+; NO-SIMD128-NEXT: i32.lt_u $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.select $push20=, $12, $28, $pop19
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push86=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $11, $pop86
+; NO-SIMD128-NEXT: i32.const $push85=, 255
+; NO-SIMD128-NEXT: i32.and $push21=, $27, $pop85
+; NO-SIMD128-NEXT: i32.lt_u $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.select $push24=, $11, $27, $pop23
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push84=, 255
+; NO-SIMD128-NEXT: i32.and $push26=, $10, $pop84
+; NO-SIMD128-NEXT: i32.const $push83=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $26, $pop83
+; NO-SIMD128-NEXT: i32.lt_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.select $push28=, $10, $26, $pop27
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push82=, 255
+; NO-SIMD128-NEXT: i32.and $push30=, $9, $pop82
+; NO-SIMD128-NEXT: i32.const $push81=, 255
+; NO-SIMD128-NEXT: i32.and $push29=, $25, $pop81
+; NO-SIMD128-NEXT: i32.lt_u $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT: i32.select $push32=, $9, $25, $pop31
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop32
+; NO-SIMD128-NEXT: i32.const $push80=, 255
+; NO-SIMD128-NEXT: i32.and $push34=, $8, $pop80
+; NO-SIMD128-NEXT: i32.const $push79=, 255
+; NO-SIMD128-NEXT: i32.and $push33=, $24, $pop79
+; NO-SIMD128-NEXT: i32.lt_u $push35=, $pop34, $pop33
+; NO-SIMD128-NEXT: i32.select $push36=, $8, $24, $pop35
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop36
+; NO-SIMD128-NEXT: i32.const $push78=, 255
+; NO-SIMD128-NEXT: i32.and $push38=, $7, $pop78
+; NO-SIMD128-NEXT: i32.const $push77=, 255
+; NO-SIMD128-NEXT: i32.and $push37=, $23, $pop77
+; NO-SIMD128-NEXT: i32.lt_u $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT: i32.select $push40=, $7, $23, $pop39
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop40
+; NO-SIMD128-NEXT: i32.const $push76=, 255
+; NO-SIMD128-NEXT: i32.and $push42=, $6, $pop76
+; NO-SIMD128-NEXT: i32.const $push75=, 255
+; NO-SIMD128-NEXT: i32.and $push41=, $22, $pop75
+; NO-SIMD128-NEXT: i32.lt_u $push43=, $pop42, $pop41
+; NO-SIMD128-NEXT: i32.select $push44=, $6, $22, $pop43
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop44
+; NO-SIMD128-NEXT: i32.const $push74=, 255
+; NO-SIMD128-NEXT: i32.and $push46=, $5, $pop74
+; NO-SIMD128-NEXT: i32.const $push73=, 255
+; NO-SIMD128-NEXT: i32.and $push45=, $21, $pop73
+; NO-SIMD128-NEXT: i32.lt_u $push47=, $pop46, $pop45
+; NO-SIMD128-NEXT: i32.select $push48=, $5, $21, $pop47
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop48
+; NO-SIMD128-NEXT: i32.const $push72=, 255
+; NO-SIMD128-NEXT: i32.and $push50=, $4, $pop72
+; NO-SIMD128-NEXT: i32.const $push71=, 255
+; NO-SIMD128-NEXT: i32.and $push49=, $20, $pop71
+; NO-SIMD128-NEXT: i32.lt_u $push51=, $pop50, $pop49
+; NO-SIMD128-NEXT: i32.select $push52=, $4, $20, $pop51
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop52
+; NO-SIMD128-NEXT: i32.const $push70=, 255
+; NO-SIMD128-NEXT: i32.and $push54=, $3, $pop70
+; NO-SIMD128-NEXT: i32.const $push69=, 255
+; NO-SIMD128-NEXT: i32.and $push53=, $19, $pop69
+; NO-SIMD128-NEXT: i32.lt_u $push55=, $pop54, $pop53
+; NO-SIMD128-NEXT: i32.select $push56=, $3, $19, $pop55
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop56
+; NO-SIMD128-NEXT: i32.const $push68=, 255
+; NO-SIMD128-NEXT: i32.and $push58=, $2, $pop68
+; NO-SIMD128-NEXT: i32.const $push67=, 255
+; NO-SIMD128-NEXT: i32.and $push57=, $18, $pop67
+; NO-SIMD128-NEXT: i32.lt_u $push59=, $pop58, $pop57
+; NO-SIMD128-NEXT: i32.select $push60=, $2, $18, $pop59
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop60
+; NO-SIMD128-NEXT: i32.const $push66=, 255
+; NO-SIMD128-NEXT: i32.and $push62=, $1, $pop66
+; NO-SIMD128-NEXT: i32.const $push65=, 255
+; NO-SIMD128-NEXT: i32.and $push61=, $17, $pop65
+; NO-SIMD128-NEXT: i32.lt_u $push63=, $pop62, $pop61
+; NO-SIMD128-NEXT: i32.select $push64=, $1, $17, $pop63
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop64
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_u_v16i8:
@@ -931,138 +733,116 @@ define <16 x i8> @min_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push117=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop117
+; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop95
; NO-SIMD128-FAST-NEXT: i32.lt_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.select $push4=, $1, $17, $pop3
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push116=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop116
-; NO-SIMD128-FAST-NEXT: i32.const $push115=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $18, $pop115
+; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop94
+; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $18, $pop93
; NO-SIMD128-FAST-NEXT: i32.lt_u $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.select $push8=, $2, $18, $pop7
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push114=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop114
-; NO-SIMD128-FAST-NEXT: i32.const $push113=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $19, $pop113
+; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop92
+; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $19, $pop91
; NO-SIMD128-FAST-NEXT: i32.lt_u $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.select $push12=, $3, $19, $pop11
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push112=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop112
-; NO-SIMD128-FAST-NEXT: i32.const $push111=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $20, $pop111
+; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop90
+; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $20, $pop89
; NO-SIMD128-FAST-NEXT: i32.lt_u $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.select $push16=, $4, $20, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push110=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $5, $pop110
-; NO-SIMD128-FAST-NEXT: i32.const $push109=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $21, $pop109
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.select $push22=, $5, $21, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push108=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $6, $pop108
-; NO-SIMD128-FAST-NEXT: i32.const $push107=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $22, $pop107
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.select $push26=, $6, $22, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push106=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $7, $pop106
-; NO-SIMD128-FAST-NEXT: i32.const $push105=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $23, $pop105
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $5, $pop88
+; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $21, $pop87
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.select $push20=, $5, $21, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push86=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $6, $pop86
+; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $22, $pop85
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.select $push24=, $6, $22, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $7, $pop84
+; NO-SIMD128-FAST-NEXT: i32.const $push83=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $23, $pop83
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.select $push28=, $7, $23, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push82=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push30=, $8, $pop82
+; NO-SIMD128-FAST-NEXT: i32.const $push81=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $24, $pop81
; NO-SIMD128-FAST-NEXT: i32.lt_u $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.select $push32=, $7, $23, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push104=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push36=, $8, $pop104
-; NO-SIMD128-FAST-NEXT: i32.const $push103=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $24, $pop103
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.select $push38=, $8, $24, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push102=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push42=, $9, $pop102
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push41=, $25, $pop101
+; NO-SIMD128-FAST-NEXT: i32.select $push32=, $8, $24, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push80=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $9, $pop80
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push33=, $25, $pop79
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push35=, $pop34, $pop33
+; NO-SIMD128-FAST-NEXT: i32.select $push36=, $9, $25, $pop35
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push38=, $10, $pop78
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $26, $pop77
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT: i32.select $push40=, $10, $26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push42=, $11, $pop76
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push41=, $27, $pop75
; NO-SIMD128-FAST-NEXT: i32.lt_u $push43=, $pop42, $pop41
-; NO-SIMD128-FAST-NEXT: i32.select $push44=, $9, $25, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push46=, $10, $pop100
-; NO-SIMD128-FAST-NEXT: i32.const $push99=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push45=, $26, $pop99
+; NO-SIMD128-FAST-NEXT: i32.select $push44=, $11, $27, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push46=, $12, $pop74
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push45=, $28, $pop73
; NO-SIMD128-FAST-NEXT: i32.lt_u $push47=, $pop46, $pop45
-; NO-SIMD128-FAST-NEXT: i32.select $push48=, $10, $26, $pop47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push52=, $11, $pop98
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push51=, $27, $pop97
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push53=, $pop52, $pop51
-; NO-SIMD128-FAST-NEXT: i32.select $push54=, $11, $27, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push58=, $12, $pop96
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push57=, $28, $pop95
+; NO-SIMD128-FAST-NEXT: i32.select $push48=, $12, $28, $pop47
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push50=, $13, $pop72
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push49=, $29, $pop71
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push51=, $pop50, $pop49
+; NO-SIMD128-FAST-NEXT: i32.select $push52=, $13, $29, $pop51
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push54=, $14, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push53=, $30, $pop69
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push55=, $pop54, $pop53
+; NO-SIMD128-FAST-NEXT: i32.select $push56=, $14, $30, $pop55
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push58=, $15, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push57=, $31, $pop67
; NO-SIMD128-FAST-NEXT: i32.lt_u $push59=, $pop58, $pop57
-; NO-SIMD128-FAST-NEXT: i32.select $push60=, $12, $28, $pop59
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop60
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push68=, $0, $pop67
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push64=, $13, $pop94
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push63=, $29, $pop93
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push65=, $pop64, $pop63
-; NO-SIMD128-FAST-NEXT: i32.select $push66=, $13, $29, $pop65
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop68), $pop66
-; NO-SIMD128-FAST-NEXT: i32.const $push73=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push70=, $14, $pop92
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push69=, $30, $pop91
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push71=, $pop70, $pop69
-; NO-SIMD128-FAST-NEXT: i32.select $push72=, $14, $30, $pop71
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push80=, $0, $pop79
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push76=, $15, $pop90
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push75=, $31, $pop89
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push77=, $pop76, $pop75
-; NO-SIMD128-FAST-NEXT: i32.select $push78=, $15, $31, $pop77
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop80), $pop78
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push86=, $0, $pop85
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push82=, $16, $pop88
-; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push81=, $32, $pop87
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push83=, $pop82, $pop81
-; NO-SIMD128-FAST-NEXT: i32.select $push84=, $16, $32, $pop83
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop86), $pop84
+; NO-SIMD128-FAST-NEXT: i32.select $push60=, $15, $31, $pop59
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push62=, $16, $pop66
+; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push61=, $32, $pop65
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push63=, $pop62, $pop61
+; NO-SIMD128-FAST-NEXT: i32.select $push64=, $16, $32, $pop63
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64
; NO-SIMD128-FAST-NEXT: return
%c = icmp ult <16 x i8> %x, %y
%a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
@@ -1085,108 +865,86 @@ define <16 x i8> @max_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: max_s_v16i8:
; NO-SIMD128: .functype max_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 15
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16
; NO-SIMD128-NEXT: i32.extend8_s $push0=, $32
; NO-SIMD128-NEXT: i32.gt_s $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $16, $32, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.extend8_s $push7=, $15
-; NO-SIMD128-NEXT: i32.extend8_s $push6=, $31
-; NO-SIMD128-NEXT: i32.gt_s $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $15, $31, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push16=, 13
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.extend8_s $push13=, $14
-; NO-SIMD128-NEXT: i32.extend8_s $push12=, $30
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop3
+; NO-SIMD128-NEXT: i32.extend8_s $push5=, $15
+; NO-SIMD128-NEXT: i32.extend8_s $push4=, $31
+; NO-SIMD128-NEXT: i32.gt_s $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $15, $31, $pop6
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop7
+; NO-SIMD128-NEXT: i32.extend8_s $push9=, $14
+; NO-SIMD128-NEXT: i32.extend8_s $push8=, $30
+; NO-SIMD128-NEXT: i32.gt_s $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $14, $30, $pop10
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop11
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $13
+; NO-SIMD128-NEXT: i32.extend8_s $push12=, $29
; NO-SIMD128-NEXT: i32.gt_s $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.select $push15=, $14, $30, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push22=, 12
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.extend8_s $push19=, $13
-; NO-SIMD128-NEXT: i32.extend8_s $push18=, $29
-; NO-SIMD128-NEXT: i32.gt_s $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.select $push21=, $13, $29, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push28=, 11
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.extend8_s $push25=, $12
-; NO-SIMD128-NEXT: i32.extend8_s $push24=, $28
+; NO-SIMD128-NEXT: i32.select $push15=, $13, $29, $pop14
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop15
+; NO-SIMD128-NEXT: i32.extend8_s $push17=, $12
+; NO-SIMD128-NEXT: i32.extend8_s $push16=, $28
+; NO-SIMD128-NEXT: i32.gt_s $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.select $push19=, $12, $28, $pop18
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop19
+; NO-SIMD128-NEXT: i32.extend8_s $push21=, $11
+; NO-SIMD128-NEXT: i32.extend8_s $push20=, $27
+; NO-SIMD128-NEXT: i32.gt_s $push22=, $pop21, $pop20
+; NO-SIMD128-NEXT: i32.select $push23=, $11, $27, $pop22
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop23
+; NO-SIMD128-NEXT: i32.extend8_s $push25=, $10
+; NO-SIMD128-NEXT: i32.extend8_s $push24=, $26
; NO-SIMD128-NEXT: i32.gt_s $push26=, $pop25, $pop24
-; NO-SIMD128-NEXT: i32.select $push27=, $12, $28, $pop26
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push34=, 10
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.extend8_s $push31=, $11
-; NO-SIMD128-NEXT: i32.extend8_s $push30=, $27
-; NO-SIMD128-NEXT: i32.gt_s $push32=, $pop31, $pop30
-; NO-SIMD128-NEXT: i32.select $push33=, $11, $27, $pop32
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push40=, 9
-; NO-SIMD128-NEXT: i32.add $push41=, $0, $pop40
-; NO-SIMD128-NEXT: i32.extend8_s $push37=, $10
-; NO-SIMD128-NEXT: i32.extend8_s $push36=, $26
+; NO-SIMD128-NEXT: i32.select $push27=, $10, $26, $pop26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop27
+; NO-SIMD128-NEXT: i32.extend8_s $push29=, $9
+; NO-SIMD128-NEXT: i32.extend8_s $push28=, $25
+; NO-SIMD128-NEXT: i32.gt_s $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT: i32.select $push31=, $9, $25, $pop30
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop31
+; NO-SIMD128-NEXT: i32.extend8_s $push33=, $8
+; NO-SIMD128-NEXT: i32.extend8_s $push32=, $24
+; NO-SIMD128-NEXT: i32.gt_s $push34=, $pop33, $pop32
+; NO-SIMD128-NEXT: i32.select $push35=, $8, $24, $pop34
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop35
+; NO-SIMD128-NEXT: i32.extend8_s $push37=, $7
+; NO-SIMD128-NEXT: i32.extend8_s $push36=, $23
; NO-SIMD128-NEXT: i32.gt_s $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.select $push39=, $10, $26, $pop38
-; NO-SIMD128-NEXT: i32.store8 0($pop41), $pop39
-; NO-SIMD128-NEXT: i32.extend8_s $push43=, $9
-; NO-SIMD128-NEXT: i32.extend8_s $push42=, $25
-; NO-SIMD128-NEXT: i32.gt_s $push44=, $pop43, $pop42
-; NO-SIMD128-NEXT: i32.select $push45=, $9, $25, $pop44
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop45
-; NO-SIMD128-NEXT: i32.const $push50=, 7
-; NO-SIMD128-NEXT: i32.add $push51=, $0, $pop50
-; NO-SIMD128-NEXT: i32.extend8_s $push47=, $8
-; NO-SIMD128-NEXT: i32.extend8_s $push46=, $24
-; NO-SIMD128-NEXT: i32.gt_s $push48=, $pop47, $pop46
-; NO-SIMD128-NEXT: i32.select $push49=, $8, $24, $pop48
-; NO-SIMD128-NEXT: i32.store8 0($pop51), $pop49
-; NO-SIMD128-NEXT: i32.const $push56=, 6
-; NO-SIMD128-NEXT: i32.add $push57=, $0, $pop56
-; NO-SIMD128-NEXT: i32.extend8_s $push53=, $7
-; NO-SIMD128-NEXT: i32.extend8_s $push52=, $23
+; NO-SIMD128-NEXT: i32.select $push39=, $7, $23, $pop38
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop39
+; NO-SIMD128-NEXT: i32.extend8_s $push41=, $6
+; NO-SIMD128-NEXT: i32.extend8_s $push40=, $22
+; NO-SIMD128-NEXT: i32.gt_s $push42=, $pop41, $pop40
+; NO-SIMD128-NEXT: i32.select $push43=, $6, $22, $pop42
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop43
+; NO-SIMD128-NEXT: i32.extend8_s $push45=, $5
+; NO-SIMD128-NEXT: i32.extend8_s $push44=, $21
+; NO-SIMD128-NEXT: i32.gt_s $push46=, $pop45, $pop44
+; NO-SIMD128-NEXT: i32.select $push47=, $5, $21, $pop46
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop47
+; NO-SIMD128-NEXT: i32.extend8_s $push49=, $4
+; NO-SIMD128-NEXT: i32.extend8_s $push48=, $20
+; NO-SIMD128-NEXT: i32.gt_s $push50=, $pop49, $pop48
+; NO-SIMD128-NEXT: i32.select $push51=, $4, $20, $pop50
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop51
+; NO-SIMD128-NEXT: i32.extend8_s $push53=, $3
+; NO-SIMD128-NEXT: i32.extend8_s $push52=, $19
; NO-SIMD128-NEXT: i32.gt_s $push54=, $pop53, $pop52
-; NO-SIMD128-NEXT: i32.select $push55=, $7, $23, $pop54
-; NO-SIMD128-NEXT: i32.store8 0($pop57), $pop55
-; NO-SIMD128-NEXT: i32.const $push62=, 5
-; NO-SIMD128-NEXT: i32.add $push63=, $0, $pop62
-; NO-SIMD128-NEXT: i32.extend8_s $push59=, $6
-; NO-SIMD128-NEXT: i32.extend8_s $push58=, $22
-; NO-SIMD128-NEXT: i32.gt_s $push60=, $pop59, $pop58
-; NO-SIMD128-NEXT: i32.select $push61=, $6, $22, $pop60
-; NO-SIMD128-NEXT: i32.store8 0($pop63), $pop61
-; NO-SIMD128-NEXT: i32.extend8_s $push65=, $5
-; NO-SIMD128-NEXT: i32.extend8_s $push64=, $21
-; NO-SIMD128-NEXT: i32.gt_s $push66=, $pop65, $pop64
-; NO-SIMD128-NEXT: i32.select $push67=, $5, $21, $pop66
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop67
-; NO-SIMD128-NEXT: i32.const $push72=, 3
-; NO-SIMD128-NEXT: i32.add $push73=, $0, $pop72
-; NO-SIMD128-NEXT: i32.extend8_s $push69=, $4
-; NO-SIMD128-NEXT: i32.extend8_s $push68=, $20
-; NO-SIMD128-NEXT: i32.gt_s $push70=, $pop69, $pop68
-; NO-SIMD128-NEXT: i32.select $push71=, $4, $20, $pop70
-; NO-SIMD128-NEXT: i32.store8 0($pop73), $pop71
-; NO-SIMD128-NEXT: i32.extend8_s $push75=, $3
-; NO-SIMD128-NEXT: i32.extend8_s $push74=, $19
-; NO-SIMD128-NEXT: i32.gt_s $push76=, $pop75, $pop74
-; NO-SIMD128-NEXT: i32.select $push77=, $3, $19, $pop76
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop77
-; NO-SIMD128-NEXT: i32.extend8_s $push79=, $2
-; NO-SIMD128-NEXT: i32.extend8_s $push78=, $18
-; NO-SIMD128-NEXT: i32.gt_s $push80=, $pop79, $pop78
-; NO-SIMD128-NEXT: i32.select $push81=, $2, $18, $pop80
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop81
-; NO-SIMD128-NEXT: i32.extend8_s $push83=, $1
-; NO-SIMD128-NEXT: i32.extend8_s $push82=, $17
-; NO-SIMD128-NEXT: i32.gt_s $push84=, $pop83, $pop82
-; NO-SIMD128-NEXT: i32.select $push85=, $1, $17, $pop84
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop85
+; NO-SIMD128-NEXT: i32.select $push55=, $3, $19, $pop54
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop55
+; NO-SIMD128-NEXT: i32.extend8_s $push57=, $2
+; NO-SIMD128-NEXT: i32.extend8_s $push56=, $18
+; NO-SIMD128-NEXT: i32.gt_s $push58=, $pop57, $pop56
+; NO-SIMD128-NEXT: i32.select $push59=, $2, $18, $pop58
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop59
+; NO-SIMD128-NEXT: i32.extend8_s $push61=, $1
+; NO-SIMD128-NEXT: i32.extend8_s $push60=, $17
+; NO-SIMD128-NEXT: i32.gt_s $push62=, $pop61, $pop60
+; NO-SIMD128-NEXT: i32.select $push63=, $1, $17, $pop62
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop63
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_s_v16i8:
@@ -1207,93 +965,71 @@ define <16 x i8> @max_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.gt_s $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $3, $19, $pop10
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $4
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $20
; NO-SIMD128-FAST-NEXT: i32.gt_s $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $4, $20, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $5
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $21
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.select $push21=, $5, $21, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $6
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $22
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push24=, $pop23, $pop22
-; NO-SIMD128-FAST-NEXT: i32.select $push25=, $6, $22, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $7
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push17=, $5
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $21
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.select $push19=, $5, $21, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop19
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $6
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $22
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push22=, $pop21, $pop20
+; NO-SIMD128-FAST-NEXT: i32.select $push23=, $6, $22, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop23
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $7
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push24=, $23
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push26=, $pop25, $pop24
+; NO-SIMD128-FAST-NEXT: i32.select $push27=, $7, $23, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $24
; NO-SIMD128-FAST-NEXT: i32.gt_s $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.select $push31=, $7, $23, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push35=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push34=, $24
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push36=, $pop35, $pop34
-; NO-SIMD128-FAST-NEXT: i32.select $push37=, $8, $24, $pop36
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop39), $pop37
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $9
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $25
+; NO-SIMD128-FAST-NEXT: i32.select $push31=, $8, $24, $pop30
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop31
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push33=, $9
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push32=, $25
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push34=, $pop33, $pop32
+; NO-SIMD128-FAST-NEXT: i32.select $push35=, $9, $25, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop35
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push37=, $10
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push36=, $26
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push38=, $pop37, $pop36
+; NO-SIMD128-FAST-NEXT: i32.select $push39=, $10, $26, $pop38
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop39
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $11
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $27
; NO-SIMD128-FAST-NEXT: i32.gt_s $push42=, $pop41, $pop40
-; NO-SIMD128-FAST-NEXT: i32.select $push43=, $9, $25, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop43
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push49=, $0, $pop48
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $10
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $26
+; NO-SIMD128-FAST-NEXT: i32.select $push43=, $11, $27, $pop42
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop43
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $28
; NO-SIMD128-FAST-NEXT: i32.gt_s $push46=, $pop45, $pop44
-; NO-SIMD128-FAST-NEXT: i32.select $push47=, $10, $26, $pop46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop49), $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push55=, $0, $pop54
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push51=, $11
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push50=, $27
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push52=, $pop51, $pop50
-; NO-SIMD128-FAST-NEXT: i32.select $push53=, $11, $27, $pop52
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop55), $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push61=, $0, $pop60
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push57=, $12
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push56=, $28
+; NO-SIMD128-FAST-NEXT: i32.select $push47=, $12, $28, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop47
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push49=, $13
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push48=, $29
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push50=, $pop49, $pop48
+; NO-SIMD128-FAST-NEXT: i32.select $push51=, $13, $29, $pop50
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop51
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push53=, $14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push52=, $30
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push54=, $pop53, $pop52
+; NO-SIMD128-FAST-NEXT: i32.select $push55=, $14, $30, $pop54
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop55
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push57=, $15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push56=, $31
; NO-SIMD128-FAST-NEXT: i32.gt_s $push58=, $pop57, $pop56
-; NO-SIMD128-FAST-NEXT: i32.select $push59=, $12, $28, $pop58
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop61), $pop59
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push67=, $0, $pop66
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push63=, $13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push62=, $29
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push64=, $pop63, $pop62
-; NO-SIMD128-FAST-NEXT: i32.select $push65=, $13, $29, $pop64
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop67), $pop65
-; NO-SIMD128-FAST-NEXT: i32.const $push72=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push73=, $0, $pop72
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push69=, $14
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push68=, $30
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push70=, $pop69, $pop68
-; NO-SIMD128-FAST-NEXT: i32.select $push71=, $14, $30, $pop70
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop73), $pop71
-; NO-SIMD128-FAST-NEXT: i32.const $push78=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push79=, $0, $pop78
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push75=, $15
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push74=, $31
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push76=, $pop75, $pop74
-; NO-SIMD128-FAST-NEXT: i32.select $push77=, $15, $31, $pop76
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop79), $pop77
-; NO-SIMD128-FAST-NEXT: i32.const $push84=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push85=, $0, $pop84
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push81=, $16
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push80=, $32
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push82=, $pop81, $pop80
-; NO-SIMD128-FAST-NEXT: i32.select $push83=, $16, $32, $pop82
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop85), $pop83
+; NO-SIMD128-FAST-NEXT: i32.select $push59=, $15, $31, $pop58
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop59
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push61=, $16
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push60=, $32
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push62=, $pop61, $pop60
+; NO-SIMD128-FAST-NEXT: i32.select $push63=, $16, $32, $pop62
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop63
; NO-SIMD128-FAST-NEXT: return
%c = icmp sgt <16 x i8> %x, %y
%a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
@@ -1316,140 +1052,118 @@ define <16 x i8> @max_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: max_u_v16i8:
; NO-SIMD128: .functype max_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 15
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.const $push0=, 255
; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0
-; NO-SIMD128-NEXT: i32.const $push117=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop117
+; NO-SIMD128-NEXT: i32.const $push95=, 255
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop95
; NO-SIMD128-NEXT: i32.gt_u $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.select $push4=, $16, $32, $pop3
-; NO-SIMD128-NEXT: i32.store8 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push116=, 255
-; NO-SIMD128-NEXT: i32.and $push8=, $15, $pop116
-; NO-SIMD128-NEXT: i32.const $push115=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $31, $pop115
-; NO-SIMD128-NEXT: i32.gt_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.select $push10=, $15, $31, $pop9
-; NO-SIMD128-NEXT: i32.store8 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 13
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push114=, 255
-; NO-SIMD128-NEXT: i32.and $push14=, $14, $pop114
-; NO-SIMD128-NEXT: i32.const $push113=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $30, $pop113
-; NO-SIMD128-NEXT: i32.gt_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.select $push16=, $14, $30, $pop15
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push23=, 12
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.const $push112=, 255
-; NO-SIMD128-NEXT: i32.and $push20=, $13, $pop112
-; NO-SIMD128-NEXT: i32.const $push111=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $29, $pop111
-; NO-SIMD128-NEXT: i32.gt_u $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.select $push22=, $13, $29, $pop21
-; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push110=, 255
-; NO-SIMD128-NEXT: i32.and $push26=, $12, $pop110
-; NO-SIMD128-NEXT: i32.const $push109=, 255
-; NO-SIMD128-NEXT: i32.and $push25=, $28, $pop109
-; NO-SIMD128-NEXT: i32.gt_u $push27=, $pop26, $pop25
-; NO-SIMD128-NEXT: i32.select $push28=, $12, $28, $pop27
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push35=, 10
-; NO-SIMD128-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-NEXT: i32.const $push108=, 255
-; NO-SIMD128-NEXT: i32.and $push32=, $11, $pop108
-; NO-SIMD128-NEXT: i32.const $push107=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $27, $pop107
-; NO-SIMD128-NEXT: i32.gt_u $push33=, $pop32, $pop31
-; NO-SIMD128-NEXT: i32.select $push34=, $11, $27, $pop33
-; NO-SIMD128-NEXT: i32.store8 0($pop36), $pop34
-; NO-SIMD128-NEXT: i32.const $push41=, 9
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.const $push106=, 255
-; NO-SIMD128-NEXT: i32.and $push38=, $10, $pop106
-; NO-SIMD128-NEXT: i32.const $push105=, 255
-; NO-SIMD128-NEXT: i32.and $push37=, $26, $pop105
-; NO-SIMD128-NEXT: i32.gt_u $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.select $push40=, $10, $26, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push104=, 255
-; NO-SIMD128-NEXT: i32.and $push44=, $9, $pop104
-; NO-SIMD128-NEXT: i32.const $push103=, 255
-; NO-SIMD128-NEXT: i32.and $push43=, $25, $pop103
-; NO-SIMD128-NEXT: i32.gt_u $push45=, $pop44, $pop43
-; NO-SIMD128-NEXT: i32.select $push46=, $9, $25, $pop45
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop46
-; NO-SIMD128-NEXT: i32.const $push51=, 7
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.const $push102=, 255
-; NO-SIMD128-NEXT: i32.and $push48=, $8, $pop102
-; NO-SIMD128-NEXT: i32.const $push101=, 255
-; NO-SIMD128-NEXT: i32.and $push47=, $24, $pop101
-; NO-SIMD128-NEXT: i32.gt_u $push49=, $pop48, $pop47
-; NO-SIMD128-NEXT: i32.select $push50=, $8, $24, $pop49
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.const $push57=, 6
-; NO-SIMD128-NEXT: i32.add $push58=, $0, $pop57
-; NO-SIMD128-NEXT: i32.const $push100=, 255
-; NO-SIMD128-NEXT: i32.and $push54=, $7, $pop100
-; NO-SIMD128-NEXT: i32.const $push99=, 255
-; NO-SIMD128-NEXT: i32.and $push53=, $23, $pop99
-; NO-SIMD128-NEXT: i32.gt_u $push55=, $pop54, $pop53
-; NO-SIMD128-NEXT: i32.select $push56=, $7, $23, $pop55
-; NO-SIMD128-NEXT: i32.store8 0($pop58), $pop56
-; NO-SIMD128-NEXT: i32.const $push63=, 5
-; NO-SIMD128-NEXT: i32.add $push64=, $0, $pop63
-; NO-SIMD128-NEXT: i32.const $push98=, 255
-; NO-SIMD128-NEXT: i32.and $push60=, $6, $pop98
-; NO-SIMD128-NEXT: i32.const $push97=, 255
-; NO-SIMD128-NEXT: i32.and $push59=, $22, $pop97
-; NO-SIMD128-NEXT: i32.gt_u $push61=, $pop60, $pop59
-; NO-SIMD128-NEXT: i32.select $push62=, $6, $22, $pop61
-; NO-SIMD128-NEXT: i32.store8 0($pop64), $pop62
-; NO-SIMD128-NEXT: i32.const $push96=, 255
-; NO-SIMD128-NEXT: i32.and $push66=, $5, $pop96
-; NO-SIMD128-NEXT: i32.const $push95=, 255
-; NO-SIMD128-NEXT: i32.and $push65=, $21, $pop95
-; NO-SIMD128-NEXT: i32.gt_u $push67=, $pop66, $pop65
-; NO-SIMD128-NEXT: i32.select $push68=, $5, $21, $pop67
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop68
-; NO-SIMD128-NEXT: i32.const $push73=, 3
-; NO-SIMD128-NEXT: i32.add $push74=, $0, $pop73
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop4
; NO-SIMD128-NEXT: i32.const $push94=, 255
-; NO-SIMD128-NEXT: i32.and $push70=, $4, $pop94
+; NO-SIMD128-NEXT: i32.and $push6=, $15, $pop94
; NO-SIMD128-NEXT: i32.const $push93=, 255
-; NO-SIMD128-NEXT: i32.and $push69=, $20, $pop93
-; NO-SIMD128-NEXT: i32.gt_u $push71=, $pop70, $pop69
-; NO-SIMD128-NEXT: i32.select $push72=, $4, $20, $pop71
-; NO-SIMD128-NEXT: i32.store8 0($pop74), $pop72
+; NO-SIMD128-NEXT: i32.and $push5=, $31, $pop93
+; NO-SIMD128-NEXT: i32.gt_u $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.select $push8=, $15, $31, $pop7
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop8
; NO-SIMD128-NEXT: i32.const $push92=, 255
-; NO-SIMD128-NEXT: i32.and $push76=, $3, $pop92
+; NO-SIMD128-NEXT: i32.and $push10=, $14, $pop92
; NO-SIMD128-NEXT: i32.const $push91=, 255
-; NO-SIMD128-NEXT: i32.and $push75=, $19, $pop91
-; NO-SIMD128-NEXT: i32.gt_u $push77=, $pop76, $pop75
-; NO-SIMD128-NEXT: i32.select $push78=, $3, $19, $pop77
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop78
+; NO-SIMD128-NEXT: i32.and $push9=, $30, $pop91
+; NO-SIMD128-NEXT: i32.gt_u $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.select $push12=, $14, $30, $pop11
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop12
; NO-SIMD128-NEXT: i32.const $push90=, 255
-; NO-SIMD128-NEXT: i32.and $push80=, $2, $pop90
+; NO-SIMD128-NEXT: i32.and $push14=, $13, $pop90
; NO-SIMD128-NEXT: i32.const $push89=, 255
-; NO-SIMD128-NEXT: i32.and $push79=, $18, $pop89
-; NO-SIMD128-NEXT: i32.gt_u $push81=, $pop80, $pop79
-; NO-SIMD128-NEXT: i32.select $push82=, $2, $18, $pop81
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop82
+; NO-SIMD128-NEXT: i32.and $push13=, $29, $pop89
+; NO-SIMD128-NEXT: i32.gt_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.select $push16=, $13, $29, $pop15
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop16
; NO-SIMD128-NEXT: i32.const $push88=, 255
-; NO-SIMD128-NEXT: i32.and $push84=, $1, $pop88
+; NO-SIMD128-NEXT: i32.and $push18=, $12, $pop88
; NO-SIMD128-NEXT: i32.const $push87=, 255
-; NO-SIMD128-NEXT: i32.and $push83=, $17, $pop87
-; NO-SIMD128-NEXT: i32.gt_u $push85=, $pop84, $pop83
-; NO-SIMD128-NEXT: i32.select $push86=, $1, $17, $pop85
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop86
+; NO-SIMD128-NEXT: i32.and $push17=, $28, $pop87
+; NO-SIMD128-NEXT: i32.gt_u $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.select $push20=, $12, $28, $pop19
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push86=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $11, $pop86
+; NO-SIMD128-NEXT: i32.const $push85=, 255
+; NO-SIMD128-NEXT: i32.and $push21=, $27, $pop85
+; NO-SIMD128-NEXT: i32.gt_u $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.select $push24=, $11, $27, $pop23
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push84=, 255
+; NO-SIMD128-NEXT: i32.and $push26=, $10, $pop84
+; NO-SIMD128-NEXT: i32.const $push83=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $26, $pop83
+; NO-SIMD128-NEXT: i32.gt_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.select $push28=, $10, $26, $pop27
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push82=, 255
+; NO-SIMD128-NEXT: i32.and $push30=, $9, $pop82
+; NO-SIMD128-NEXT: i32.const $push81=, 255
+; NO-SIMD128-NEXT: i32.and $push29=, $25, $pop81
+; NO-SIMD128-NEXT: i32.gt_u $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT: i32.select $push32=, $9, $25, $pop31
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop32
+; NO-SIMD128-NEXT: i32.const $push80=, 255
+; NO-SIMD128-NEXT: i32.and $push34=, $8, $pop80
+; NO-SIMD128-NEXT: i32.const $push79=, 255
+; NO-SIMD128-NEXT: i32.and $push33=, $24, $pop79
+; NO-SIMD128-NEXT: i32.gt_u $push35=, $pop34, $pop33
+; NO-SIMD128-NEXT: i32.select $push36=, $8, $24, $pop35
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop36
+; NO-SIMD128-NEXT: i32.const $push78=, 255
+; NO-SIMD128-NEXT: i32.and $push38=, $7, $pop78
+; NO-SIMD128-NEXT: i32.const $push77=, 255
+; NO-SIMD128-NEXT: i32.and $push37=, $23, $pop77
+; NO-SIMD128-NEXT: i32.gt_u $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT: i32.select $push40=, $7, $23, $pop39
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop40
+; NO-SIMD128-NEXT: i32.const $push76=, 255
+; NO-SIMD128-NEXT: i32.and $push42=, $6, $pop76
+; NO-SIMD128-NEXT: i32.const $push75=, 255
+; NO-SIMD128-NEXT: i32.and $push41=, $22, $pop75
+; NO-SIMD128-NEXT: i32.gt_u $push43=, $pop42, $pop41
+; NO-SIMD128-NEXT: i32.select $push44=, $6, $22, $pop43
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop44
+; NO-SIMD128-NEXT: i32.const $push74=, 255
+; NO-SIMD128-NEXT: i32.and $push46=, $5, $pop74
+; NO-SIMD128-NEXT: i32.const $push73=, 255
+; NO-SIMD128-NEXT: i32.and $push45=, $21, $pop73
+; NO-SIMD128-NEXT: i32.gt_u $push47=, $pop46, $pop45
+; NO-SIMD128-NEXT: i32.select $push48=, $5, $21, $pop47
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop48
+; NO-SIMD128-NEXT: i32.const $push72=, 255
+; NO-SIMD128-NEXT: i32.and $push50=, $4, $pop72
+; NO-SIMD128-NEXT: i32.const $push71=, 255
+; NO-SIMD128-NEXT: i32.and $push49=, $20, $pop71
+; NO-SIMD128-NEXT: i32.gt_u $push51=, $pop50, $pop49
+; NO-SIMD128-NEXT: i32.select $push52=, $4, $20, $pop51
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop52
+; NO-SIMD128-NEXT: i32.const $push70=, 255
+; NO-SIMD128-NEXT: i32.and $push54=, $3, $pop70
+; NO-SIMD128-NEXT: i32.const $push69=, 255
+; NO-SIMD128-NEXT: i32.and $push53=, $19, $pop69
+; NO-SIMD128-NEXT: i32.gt_u $push55=, $pop54, $pop53
+; NO-SIMD128-NEXT: i32.select $push56=, $3, $19, $pop55
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop56
+; NO-SIMD128-NEXT: i32.const $push68=, 255
+; NO-SIMD128-NEXT: i32.and $push58=, $2, $pop68
+; NO-SIMD128-NEXT: i32.const $push67=, 255
+; NO-SIMD128-NEXT: i32.and $push57=, $18, $pop67
+; NO-SIMD128-NEXT: i32.gt_u $push59=, $pop58, $pop57
+; NO-SIMD128-NEXT: i32.select $push60=, $2, $18, $pop59
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop60
+; NO-SIMD128-NEXT: i32.const $push66=, 255
+; NO-SIMD128-NEXT: i32.and $push62=, $1, $pop66
+; NO-SIMD128-NEXT: i32.const $push65=, 255
+; NO-SIMD128-NEXT: i32.and $push61=, $17, $pop65
+; NO-SIMD128-NEXT: i32.gt_u $push63=, $pop62, $pop61
+; NO-SIMD128-NEXT: i32.select $push64=, $1, $17, $pop63
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop64
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_u_v16i8:
@@ -1457,138 +1171,116 @@ define <16 x i8> @max_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push117=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop117
+; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop95
; NO-SIMD128-FAST-NEXT: i32.gt_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.select $push4=, $1, $17, $pop3
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push116=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop116
-; NO-SIMD128-FAST-NEXT: i32.const $push115=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $18, $pop115
+; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop94
+; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $18, $pop93
; NO-SIMD128-FAST-NEXT: i32.gt_u $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.select $push8=, $2, $18, $pop7
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push114=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop114
-; NO-SIMD128-FAST-NEXT: i32.const $push113=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $19, $pop113
+; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop92
+; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $19, $pop91
; NO-SIMD128-FAST-NEXT: i32.gt_u $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.select $push12=, $3, $19, $pop11
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push112=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop112
-; NO-SIMD128-FAST-NEXT: i32.const $push111=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $20, $pop111
+; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop90
+; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $20, $pop89
; NO-SIMD128-FAST-NEXT: i32.gt_u $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.select $push16=, $4, $20, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push110=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $5, $pop110
-; NO-SIMD128-FAST-NEXT: i32.const $push109=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $21, $pop109
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.select $push22=, $5, $21, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push108=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $6, $pop108
-; NO-SIMD128-FAST-NEXT: i32.const $push107=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $22, $pop107
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.select $push26=, $6, $22, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push106=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $7, $pop106
-; NO-SIMD128-FAST-NEXT: i32.const $push105=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $23, $pop105
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $5, $pop88
+; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $21, $pop87
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.select $push20=, $5, $21, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push86=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $6, $pop86
+; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $22, $pop85
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.select $push24=, $6, $22, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $7, $pop84
+; NO-SIMD128-FAST-NEXT: i32.const $push83=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $23, $pop83
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.select $push28=, $7, $23, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push82=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push30=, $8, $pop82
+; NO-SIMD128-FAST-NEXT: i32.const $push81=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $24, $pop81
; NO-SIMD128-FAST-NEXT: i32.gt_u $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.select $push32=, $7, $23, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push104=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push36=, $8, $pop104
-; NO-SIMD128-FAST-NEXT: i32.const $push103=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $24, $pop103
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.select $push38=, $8, $24, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push102=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push42=, $9, $pop102
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push41=, $25, $pop101
+; NO-SIMD128-FAST-NEXT: i32.select $push32=, $8, $24, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push80=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $9, $pop80
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push33=, $25, $pop79
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push35=, $pop34, $pop33
+; NO-SIMD128-FAST-NEXT: i32.select $push36=, $9, $25, $pop35
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push38=, $10, $pop78
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $26, $pop77
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT: i32.select $push40=, $10, $26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push42=, $11, $pop76
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push41=, $27, $pop75
; NO-SIMD128-FAST-NEXT: i32.gt_u $push43=, $pop42, $pop41
-; NO-SIMD128-FAST-NEXT: i32.select $push44=, $9, $25, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push46=, $10, $pop100
-; NO-SIMD128-FAST-NEXT: i32.const $push99=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push45=, $26, $pop99
+; NO-SIMD128-FAST-NEXT: i32.select $push44=, $11, $27, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push46=, $12, $pop74
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push45=, $28, $pop73
; NO-SIMD128-FAST-NEXT: i32.gt_u $push47=, $pop46, $pop45
-; NO-SIMD128-FAST-NEXT: i32.select $push48=, $10, $26, $pop47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push52=, $11, $pop98
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push51=, $27, $pop97
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push53=, $pop52, $pop51
-; NO-SIMD128-FAST-NEXT: i32.select $push54=, $11, $27, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push58=, $12, $pop96
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push57=, $28, $pop95
+; NO-SIMD128-FAST-NEXT: i32.select $push48=, $12, $28, $pop47
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push50=, $13, $pop72
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push49=, $29, $pop71
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push51=, $pop50, $pop49
+; NO-SIMD128-FAST-NEXT: i32.select $push52=, $13, $29, $pop51
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push54=, $14, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push53=, $30, $pop69
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push55=, $pop54, $pop53
+; NO-SIMD128-FAST-NEXT: i32.select $push56=, $14, $30, $pop55
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push58=, $15, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push57=, $31, $pop67
; NO-SIMD128-FAST-NEXT: i32.gt_u $push59=, $pop58, $pop57
-; NO-SIMD128-FAST-NEXT: i32.select $push60=, $12, $28, $pop59
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop60
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push68=, $0, $pop67
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push64=, $13, $pop94
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push63=, $29, $pop93
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push65=, $pop64, $pop63
-; NO-SIMD128-FAST-NEXT: i32.select $push66=, $13, $29, $pop65
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop68), $pop66
-; NO-SIMD128-FAST-NEXT: i32.const $push73=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push70=, $14, $pop92
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push69=, $30, $pop91
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push71=, $pop70, $pop69
-; NO-SIMD128-FAST-NEXT: i32.select $push72=, $14, $30, $pop71
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push80=, $0, $pop79
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push76=, $15, $pop90
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push75=, $31, $pop89
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push77=, $pop76, $pop75
-; NO-SIMD128-FAST-NEXT: i32.select $push78=, $15, $31, $pop77
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop80), $pop78
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push86=, $0, $pop85
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push82=, $16, $pop88
-; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push81=, $32, $pop87
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push83=, $pop82, $pop81
-; NO-SIMD128-FAST-NEXT: i32.select $push84=, $16, $32, $pop83
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop86), $pop84
+; NO-SIMD128-FAST-NEXT: i32.select $push60=, $15, $31, $pop59
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push62=, $16, $pop66
+; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push61=, $32, $pop65
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push63=, $pop62, $pop61
+; NO-SIMD128-FAST-NEXT: i32.select $push64=, $16, $32, $pop63
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64
; NO-SIMD128-FAST-NEXT: return
%c = icmp ugt <16 x i8> %x, %y
%a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
@@ -1611,156 +1303,134 @@ define <16 x i8> @avgr_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: avgr_u_v16i8:
; NO-SIMD128: .functype avgr_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.add $push2=, $16, $32
-; NO-SIMD128-NEXT: i32.const $push3=, 1
-; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 254
-; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5
-; NO-SIMD128-NEXT: i32.const $push133=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop133
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $pop7
-; NO-SIMD128-NEXT: i32.const $push8=, 14
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.add $push10=, $15, $31
-; NO-SIMD128-NEXT: i32.const $push132=, 1
-; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop132
-; NO-SIMD128-NEXT: i32.const $push131=, 254
-; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop131
-; NO-SIMD128-NEXT: i32.const $push130=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop130
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $pop13
-; NO-SIMD128-NEXT: i32.const $push14=, 13
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.add $push16=, $14, $30
-; NO-SIMD128-NEXT: i32.const $push129=, 1
-; NO-SIMD128-NEXT: i32.add $push17=, $pop16, $pop129
-; NO-SIMD128-NEXT: i32.const $push128=, 254
-; NO-SIMD128-NEXT: i32.and $push18=, $pop17, $pop128
-; NO-SIMD128-NEXT: i32.const $push127=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push19=, $pop18, $pop127
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $pop19
-; NO-SIMD128-NEXT: i32.const $push20=, 12
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.add $push22=, $13, $29
-; NO-SIMD128-NEXT: i32.const $push126=, 1
-; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop126
-; NO-SIMD128-NEXT: i32.const $push125=, 254
-; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop125
-; NO-SIMD128-NEXT: i32.const $push124=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop124
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $pop25
-; NO-SIMD128-NEXT: i32.const $push26=, 11
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-NEXT: i32.add $push28=, $12, $28
-; NO-SIMD128-NEXT: i32.const $push123=, 1
-; NO-SIMD128-NEXT: i32.add $push29=, $pop28, $pop123
-; NO-SIMD128-NEXT: i32.const $push122=, 254
-; NO-SIMD128-NEXT: i32.and $push30=, $pop29, $pop122
-; NO-SIMD128-NEXT: i32.const $push121=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push31=, $pop30, $pop121
-; NO-SIMD128-NEXT: i32.store8 0($pop27), $pop31
-; NO-SIMD128-NEXT: i32.const $push32=, 10
-; NO-SIMD128-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-NEXT: i32.add $push34=, $11, $27
-; NO-SIMD128-NEXT: i32.const $push120=, 1
-; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop120
-; NO-SIMD128-NEXT: i32.const $push119=, 254
-; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop119
-; NO-SIMD128-NEXT: i32.const $push118=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop118
-; NO-SIMD128-NEXT: i32.store8 0($pop33), $pop37
-; NO-SIMD128-NEXT: i32.const $push38=, 9
-; NO-SIMD128-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-NEXT: i32.add $push40=, $10, $26
-; NO-SIMD128-NEXT: i32.const $push117=, 1
-; NO-SIMD128-NEXT: i32.add $push41=, $pop40, $pop117
-; NO-SIMD128-NEXT: i32.const $push116=, 254
-; NO-SIMD128-NEXT: i32.and $push42=, $pop41, $pop116
-; NO-SIMD128-NEXT: i32.const $push115=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push43=, $pop42, $pop115
-; NO-SIMD128-NEXT: i32.store8 0($pop39), $pop43
-; NO-SIMD128-NEXT: i32.add $push44=, $9, $25
-; NO-SIMD128-NEXT: i32.const $push114=, 1
-; NO-SIMD128-NEXT: i32.add $push45=, $pop44, $pop114
-; NO-SIMD128-NEXT: i32.const $push113=, 254
-; NO-SIMD128-NEXT: i32.and $push46=, $pop45, $pop113
-; NO-SIMD128-NEXT: i32.const $push112=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push47=, $pop46, $pop112
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop47
-; NO-SIMD128-NEXT: i32.const $push48=, 7
-; NO-SIMD128-NEXT: i32.add $push49=, $0, $pop48
-; NO-SIMD128-NEXT: i32.add $push50=, $8, $24
+; NO-SIMD128-NEXT: i32.add $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.const $push1=, 1
+; NO-SIMD128-NEXT: i32.add $push2=, $pop0, $pop1
+; NO-SIMD128-NEXT: i32.const $push3=, 254
+; NO-SIMD128-NEXT: i32.and $push4=, $pop2, $pop3
; NO-SIMD128-NEXT: i32.const $push111=, 1
-; NO-SIMD128-NEXT: i32.add $push51=, $pop50, $pop111
-; NO-SIMD128-NEXT: i32.const $push110=, 254
-; NO-SIMD128-NEXT: i32.and $push52=, $pop51, $pop110
-; NO-SIMD128-NEXT: i32.const $push109=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop109
-; NO-SIMD128-NEXT: i32.store8 0($pop49), $pop53
-; NO-SIMD128-NEXT: i32.const $push54=, 6
-; NO-SIMD128-NEXT: i32.add $push55=, $0, $pop54
-; NO-SIMD128-NEXT: i32.add $push56=, $7, $23
+; NO-SIMD128-NEXT: i32.shr_u $push5=, $pop4, $pop111
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $15, $31
+; NO-SIMD128-NEXT: i32.const $push110=, 1
+; NO-SIMD128-NEXT: i32.add $push7=, $pop6, $pop110
+; NO-SIMD128-NEXT: i32.const $push109=, 254
+; NO-SIMD128-NEXT: i32.and $push8=, $pop7, $pop109
; NO-SIMD128-NEXT: i32.const $push108=, 1
-; NO-SIMD128-NEXT: i32.add $push57=, $pop56, $pop108
-; NO-SIMD128-NEXT: i32.const $push107=, 254
-; NO-SIMD128-NEXT: i32.and $push58=, $pop57, $pop107
-; NO-SIMD128-NEXT: i32.const $push106=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push59=, $pop58, $pop106
-; NO-SIMD128-NEXT: i32.store8 0($pop55), $pop59
-; NO-SIMD128-NEXT: i32.const $push60=, 5
-; NO-SIMD128-NEXT: i32.add $push61=, $0, $pop60
-; NO-SIMD128-NEXT: i32.add $push62=, $6, $22
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop108
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop9
+; NO-SIMD128-NEXT: i32.add $push10=, $14, $30
+; NO-SIMD128-NEXT: i32.const $push107=, 1
+; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop107
+; NO-SIMD128-NEXT: i32.const $push106=, 254
+; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop106
; NO-SIMD128-NEXT: i32.const $push105=, 1
-; NO-SIMD128-NEXT: i32.add $push63=, $pop62, $pop105
-; NO-SIMD128-NEXT: i32.const $push104=, 254
-; NO-SIMD128-NEXT: i32.and $push64=, $pop63, $pop104
-; NO-SIMD128-NEXT: i32.const $push103=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push65=, $pop64, $pop103
-; NO-SIMD128-NEXT: i32.store8 0($pop61), $pop65
-; NO-SIMD128-NEXT: i32.add $push66=, $5, $21
+; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop105
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-NEXT: i32.add $push14=, $13, $29
+; NO-SIMD128-NEXT: i32.const $push104=, 1
+; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop104
+; NO-SIMD128-NEXT: i32.const $push103=, 254
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $pop103
; NO-SIMD128-NEXT: i32.const $push102=, 1
-; NO-SIMD128-NEXT: i32.add $push67=, $pop66, $pop102
-; NO-SIMD128-NEXT: i32.const $push101=, 254
-; NO-SIMD128-NEXT: i32.and $push68=, $pop67, $pop101
-; NO-SIMD128-NEXT: i32.const $push100=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push69=, $pop68, $pop100
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop69
-; NO-SIMD128-NEXT: i32.const $push70=, 3
-; NO-SIMD128-NEXT: i32.add $push71=, $0, $pop70
-; NO-SIMD128-NEXT: i32.add $push72=, $4, $20
+; NO-SIMD128-NEXT: i32.shr_u $push17=, $pop16, $pop102
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop17
+; NO-SIMD128-NEXT: i32.add $push18=, $12, $28
+; NO-SIMD128-NEXT: i32.const $push101=, 1
+; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop101
+; NO-SIMD128-NEXT: i32.const $push100=, 254
+; NO-SIMD128-NEXT: i32.and $push20=, $pop19, $pop100
; NO-SIMD128-NEXT: i32.const $push99=, 1
-; NO-SIMD128-NEXT: i32.add $push73=, $pop72, $pop99
-; NO-SIMD128-NEXT: i32.const $push98=, 254
-; NO-SIMD128-NEXT: i32.and $push74=, $pop73, $pop98
-; NO-SIMD128-NEXT: i32.const $push97=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push75=, $pop74, $pop97
-; NO-SIMD128-NEXT: i32.store8 0($pop71), $pop75
-; NO-SIMD128-NEXT: i32.add $push76=, $3, $19
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop99
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop21
+; NO-SIMD128-NEXT: i32.add $push22=, $11, $27
+; NO-SIMD128-NEXT: i32.const $push98=, 1
+; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop98
+; NO-SIMD128-NEXT: i32.const $push97=, 254
+; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop97
; NO-SIMD128-NEXT: i32.const $push96=, 1
-; NO-SIMD128-NEXT: i32.add $push77=, $pop76, $pop96
-; NO-SIMD128-NEXT: i32.const $push95=, 254
-; NO-SIMD128-NEXT: i32.and $push78=, $pop77, $pop95
-; NO-SIMD128-NEXT: i32.const $push94=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push79=, $pop78, $pop94
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop79
-; NO-SIMD128-NEXT: i32.add $push80=, $2, $18
+; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop96
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop25
+; NO-SIMD128-NEXT: i32.add $push26=, $10, $26
+; NO-SIMD128-NEXT: i32.const $push95=, 1
+; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop95
+; NO-SIMD128-NEXT: i32.const $push94=, 254
+; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop94
; NO-SIMD128-NEXT: i32.const $push93=, 1
-; NO-SIMD128-NEXT: i32.add $push81=, $pop80, $pop93
-; NO-SIMD128-NEXT: i32.const $push92=, 254
-; NO-SIMD128-NEXT: i32.and $push82=, $pop81, $pop92
-; NO-SIMD128-NEXT: i32.const $push91=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push83=, $pop82, $pop91
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop83
-; NO-SIMD128-NEXT: i32.add $push84=, $1, $17
+; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop93
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop29
+; NO-SIMD128-NEXT: i32.add $push30=, $9, $25
+; NO-SIMD128-NEXT: i32.const $push92=, 1
+; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop92
+; NO-SIMD128-NEXT: i32.const $push91=, 254
+; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop91
; NO-SIMD128-NEXT: i32.const $push90=, 1
-; NO-SIMD128-NEXT: i32.add $push85=, $pop84, $pop90
-; NO-SIMD128-NEXT: i32.const $push89=, 254
-; NO-SIMD128-NEXT: i32.and $push86=, $pop85, $pop89
-; NO-SIMD128-NEXT: i32.const $push88=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push87=, $pop86, $pop88
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop87
+; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop90
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop33
+; NO-SIMD128-NEXT: i32.add $push34=, $8, $24
+; NO-SIMD128-NEXT: i32.const $push89=, 1
+; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop89
+; NO-SIMD128-NEXT: i32.const $push88=, 254
+; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop88
+; NO-SIMD128-NEXT: i32.const $push87=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop87
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop37
+; NO-SIMD128-NEXT: i32.add $push38=, $7, $23
+; NO-SIMD128-NEXT: i32.const $push86=, 1
+; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop86
+; NO-SIMD128-NEXT: i32.const $push85=, 254
+; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $pop85
+; NO-SIMD128-NEXT: i32.const $push84=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop84
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop41
+; NO-SIMD128-NEXT: i32.add $push42=, $6, $22
+; NO-SIMD128-NEXT: i32.const $push83=, 1
+; NO-SIMD128-NEXT: i32.add $push43=, $pop42, $pop83
+; NO-SIMD128-NEXT: i32.const $push82=, 254
+; NO-SIMD128-NEXT: i32.and $push44=, $pop43, $pop82
+; NO-SIMD128-NEXT: i32.const $push81=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push45=, $pop44, $pop81
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop45
+; NO-SIMD128-NEXT: i32.add $push46=, $5, $21
+; NO-SIMD128-NEXT: i32.const $push80=, 1
+; NO-SIMD128-NEXT: i32.add $push47=, $pop46, $pop80
+; NO-SIMD128-NEXT: i32.const $push79=, 254
+; NO-SIMD128-NEXT: i32.and $push48=, $pop47, $pop79
+; NO-SIMD128-NEXT: i32.const $push78=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push49=, $pop48, $pop78
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop49
+; NO-SIMD128-NEXT: i32.add $push50=, $4, $20
+; NO-SIMD128-NEXT: i32.const $push77=, 1
+; NO-SIMD128-NEXT: i32.add $push51=, $pop50, $pop77
+; NO-SIMD128-NEXT: i32.const $push76=, 254
+; NO-SIMD128-NEXT: i32.and $push52=, $pop51, $pop76
+; NO-SIMD128-NEXT: i32.const $push75=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop75
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop53
+; NO-SIMD128-NEXT: i32.add $push54=, $3, $19
+; NO-SIMD128-NEXT: i32.const $push74=, 1
+; NO-SIMD128-NEXT: i32.add $push55=, $pop54, $pop74
+; NO-SIMD128-NEXT: i32.const $push73=, 254
+; NO-SIMD128-NEXT: i32.and $push56=, $pop55, $pop73
+; NO-SIMD128-NEXT: i32.const $push72=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push57=, $pop56, $pop72
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop57
+; NO-SIMD128-NEXT: i32.add $push58=, $2, $18
+; NO-SIMD128-NEXT: i32.const $push71=, 1
+; NO-SIMD128-NEXT: i32.add $push59=, $pop58, $pop71
+; NO-SIMD128-NEXT: i32.const $push70=, 254
+; NO-SIMD128-NEXT: i32.and $push60=, $pop59, $pop70
+; NO-SIMD128-NEXT: i32.const $push69=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push61=, $pop60, $pop69
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop61
+; NO-SIMD128-NEXT: i32.add $push62=, $1, $17
+; NO-SIMD128-NEXT: i32.const $push68=, 1
+; NO-SIMD128-NEXT: i32.add $push63=, $pop62, $pop68
+; NO-SIMD128-NEXT: i32.const $push67=, 254
+; NO-SIMD128-NEXT: i32.and $push64=, $pop63, $pop67
+; NO-SIMD128-NEXT: i32.const $push66=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push65=, $pop64, $pop66
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop65
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: avgr_u_v16i8:
@@ -1771,151 +1441,129 @@ define <16 x i8> @avgr_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $pop0, $pop1
; NO-SIMD128-FAST-NEXT: i32.const $push3=, 254
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop2, $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push133=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop133
+; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop111
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop5
; NO-SIMD128-FAST-NEXT: i32.add $push6=, $2, $18
-; NO-SIMD128-FAST-NEXT: i32.const $push132=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop132
-; NO-SIMD128-FAST-NEXT: i32.const $push131=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop131
-; NO-SIMD128-FAST-NEXT: i32.const $push130=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop130
+; NO-SIMD128-FAST-NEXT: i32.const $push110=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop110
+; NO-SIMD128-FAST-NEXT: i32.const $push109=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop109
+; NO-SIMD128-FAST-NEXT: i32.const $push108=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop108
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.add $push10=, $3, $19
-; NO-SIMD128-FAST-NEXT: i32.const $push129=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop129
-; NO-SIMD128-FAST-NEXT: i32.const $push128=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop128
-; NO-SIMD128-FAST-NEXT: i32.const $push127=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop127
-; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.const $push126=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $pop16, $pop126
-; NO-SIMD128-FAST-NEXT: i32.const $push125=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $pop17, $pop125
-; NO-SIMD128-FAST-NEXT: i32.const $push124=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push19=, $pop18, $pop124
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop19
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.const $push123=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $pop20, $pop123
-; NO-SIMD128-FAST-NEXT: i32.const $push122=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $pop122
-; NO-SIMD128-FAST-NEXT: i32.const $push121=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push23=, $pop22, $pop121
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.add $push26=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.const $push120=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop120
-; NO-SIMD128-FAST-NEXT: i32.const $push119=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop119
-; NO-SIMD128-FAST-NEXT: i32.const $push118=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop118
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.const $push117=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $pop32, $pop117
-; NO-SIMD128-FAST-NEXT: i32.const $push116=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $pop116
-; NO-SIMD128-FAST-NEXT: i32.const $push115=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop115
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.add $push38=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.const $push114=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop114
-; NO-SIMD128-FAST-NEXT: i32.const $push113=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop113
-; NO-SIMD128-FAST-NEXT: i32.const $push112=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop112
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop41
-; NO-SIMD128-FAST-NEXT: i32.add $push42=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push43=, $pop42, $pop111
-; NO-SIMD128-FAST-NEXT: i32.const $push110=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push44=, $pop43, $pop110
-; NO-SIMD128-FAST-NEXT: i32.const $push109=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop109
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop45
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push47=, $0, $pop46
-; NO-SIMD128-FAST-NEXT: i32.add $push48=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.const $push108=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push49=, $pop48, $pop108
-; NO-SIMD128-FAST-NEXT: i32.const $push107=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push50=, $pop49, $pop107
-; NO-SIMD128-FAST-NEXT: i32.const $push106=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push51=, $pop50, $pop106
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop47), $pop51
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push53=, $0, $pop52
-; NO-SIMD128-FAST-NEXT: i32.add $push54=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.const $push107=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop107
+; NO-SIMD128-FAST-NEXT: i32.const $push106=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop106
; NO-SIMD128-FAST-NEXT: i32.const $push105=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop105
-; NO-SIMD128-FAST-NEXT: i32.const $push104=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push56=, $pop55, $pop104
-; NO-SIMD128-FAST-NEXT: i32.const $push103=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push57=, $pop56, $pop103
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop53), $pop57
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push59=, $0, $pop58
-; NO-SIMD128-FAST-NEXT: i32.add $push60=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop105
+; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.add $push14=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.const $push104=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop104
+; NO-SIMD128-FAST-NEXT: i32.const $push103=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $pop103
; NO-SIMD128-FAST-NEXT: i32.const $push102=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push61=, $pop60, $pop102
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push62=, $pop61, $pop101
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push63=, $pop62, $pop100
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop59), $pop63
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push65=, $0, $pop64
-; NO-SIMD128-FAST-NEXT: i32.add $push66=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop102
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.add $push18=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.const $push101=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop101
+; NO-SIMD128-FAST-NEXT: i32.const $push100=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $pop100
; NO-SIMD128-FAST-NEXT: i32.const $push99=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push67=, $pop66, $pop99
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push68=, $pop67, $pop98
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push69=, $pop68, $pop97
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop65), $pop69
-; NO-SIMD128-FAST-NEXT: i32.const $push70=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push71=, $0, $pop70
-; NO-SIMD128-FAST-NEXT: i32.add $push72=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop99
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.add $push22=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.const $push98=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push23=, $pop22, $pop98
+; NO-SIMD128-FAST-NEXT: i32.const $push97=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push24=, $pop23, $pop97
; NO-SIMD128-FAST-NEXT: i32.const $push96=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push73=, $pop72, $pop96
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push74=, $pop73, $pop95
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push75=, $pop74, $pop94
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop71), $pop75
-; NO-SIMD128-FAST-NEXT: i32.const $push76=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push77=, $0, $pop76
-; NO-SIMD128-FAST-NEXT: i32.add $push78=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop96
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop25
+; NO-SIMD128-FAST-NEXT: i32.add $push26=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.const $push95=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop95
+; NO-SIMD128-FAST-NEXT: i32.const $push94=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop94
; NO-SIMD128-FAST-NEXT: i32.const $push93=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push79=, $pop78, $pop93
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push80=, $pop79, $pop92
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push81=, $pop80, $pop91
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop77), $pop81
-; NO-SIMD128-FAST-NEXT: i32.const $push82=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push83=, $0, $pop82
-; NO-SIMD128-FAST-NEXT: i32.add $push84=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop93
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop29
+; NO-SIMD128-FAST-NEXT: i32.add $push30=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.const $push92=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push31=, $pop30, $pop92
+; NO-SIMD128-FAST-NEXT: i32.const $push91=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $pop31, $pop91
; NO-SIMD128-FAST-NEXT: i32.const $push90=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push85=, $pop84, $pop90
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push86=, $pop85, $pop89
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push87=, $pop86, $pop88
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop83), $pop87
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop90
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop33
+; NO-SIMD128-FAST-NEXT: i32.add $push34=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.const $push89=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push35=, $pop34, $pop89
+; NO-SIMD128-FAST-NEXT: i32.const $push88=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push36=, $pop35, $pop88
+; NO-SIMD128-FAST-NEXT: i32.const $push87=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push37=, $pop36, $pop87
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop37
+; NO-SIMD128-FAST-NEXT: i32.add $push38=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.const $push86=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop86
+; NO-SIMD128-FAST-NEXT: i32.const $push85=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop85
+; NO-SIMD128-FAST-NEXT: i32.const $push84=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop84
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop41
+; NO-SIMD128-FAST-NEXT: i32.add $push42=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.const $push83=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push43=, $pop42, $pop83
+; NO-SIMD128-FAST-NEXT: i32.const $push82=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push44=, $pop43, $pop82
+; NO-SIMD128-FAST-NEXT: i32.const $push81=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop81
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop45
+; NO-SIMD128-FAST-NEXT: i32.add $push46=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.const $push80=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push47=, $pop46, $pop80
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push48=, $pop47, $pop79
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push49=, $pop48, $pop78
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop49
+; NO-SIMD128-FAST-NEXT: i32.add $push50=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push51=, $pop50, $pop77
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push52=, $pop51, $pop76
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push53=, $pop52, $pop75
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop53
+; NO-SIMD128-FAST-NEXT: i32.add $push54=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop74
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push56=, $pop55, $pop73
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push57=, $pop56, $pop72
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop57
+; NO-SIMD128-FAST-NEXT: i32.add $push58=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push59=, $pop58, $pop71
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push60=, $pop59, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push61=, $pop60, $pop69
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop61
+; NO-SIMD128-FAST-NEXT: i32.add $push62=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push63=, $pop62, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push64=, $pop63, $pop67
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push65=, $pop64, $pop66
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop65
; NO-SIMD128-FAST-NEXT: return
%a = add nuw <16 x i8> %x, %y
%b = add nuw <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
@@ -1949,156 +1597,134 @@ define <16 x i8> @avgr_u_v16i8_wrap(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: avgr_u_v16i8_wrap:
; NO-SIMD128: .functype avgr_u_v16i8_wrap (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.add $push2=, $16, $32
-; NO-SIMD128-NEXT: i32.const $push3=, 1
-; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 254
-; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5
-; NO-SIMD128-NEXT: i32.const $push133=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop133
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $pop7
-; NO-SIMD128-NEXT: i32.const $push8=, 14
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.add $push10=, $15, $31
-; NO-SIMD128-NEXT: i32.const $push132=, 1
-; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop132
-; NO-SIMD128-NEXT: i32.const $push131=, 254
-; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop131
-; NO-SIMD128-NEXT: i32.const $push130=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop130
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $pop13
-; NO-SIMD128-NEXT: i32.const $push14=, 13
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.add $push16=, $14, $30
-; NO-SIMD128-NEXT: i32.const $push129=, 1
-; NO-SIMD128-NEXT: i32.add $push17=, $pop16, $pop129
-; NO-SIMD128-NEXT: i32.const $push128=, 254
-; NO-SIMD128-NEXT: i32.and $push18=, $pop17, $pop128
-; NO-SIMD128-NEXT: i32.const $push127=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push19=, $pop18, $pop127
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $pop19
-; NO-SIMD128-NEXT: i32.const $push20=, 12
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.add $push22=, $13, $29
-; NO-SIMD128-NEXT: i32.const $push126=, 1
-; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop126
-; NO-SIMD128-NEXT: i32.const $push125=, 254
-; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop125
-; NO-SIMD128-NEXT: i32.const $push124=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop124
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $pop25
-; NO-SIMD128-NEXT: i32.const $push26=, 11
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-NEXT: i32.add $push28=, $12, $28
-; NO-SIMD128-NEXT: i32.const $push123=, 1
-; NO-SIMD128-NEXT: i32.add $push29=, $pop28, $pop123
-; NO-SIMD128-NEXT: i32.const $push122=, 254
-; NO-SIMD128-NEXT: i32.and $push30=, $pop29, $pop122
-; NO-SIMD128-NEXT: i32.const $push121=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push31=, $pop30, $pop121
-; NO-SIMD128-NEXT: i32.store8 0($pop27), $pop31
-; NO-SIMD128-NEXT: i32.const $push32=, 10
-; NO-SIMD128-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-NEXT: i32.add $push34=, $11, $27
-; NO-SIMD128-NEXT: i32.const $push120=, 1
-; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop120
-; NO-SIMD128-NEXT: i32.const $push119=, 254
-; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop119
-; NO-SIMD128-NEXT: i32.const $push118=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop118
-; NO-SIMD128-NEXT: i32.store8 0($pop33), $pop37
-; NO-SIMD128-NEXT: i32.const $push38=, 9
-; NO-SIMD128-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-NEXT: i32.add $push40=, $10, $26
-; NO-SIMD128-NEXT: i32.const $push117=, 1
-; NO-SIMD128-NEXT: i32.add $push41=, $pop40, $pop117
-; NO-SIMD128-NEXT: i32.const $push116=, 254
-; NO-SIMD128-NEXT: i32.and $push42=, $pop41, $pop116
-; NO-SIMD128-NEXT: i32.const $push115=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push43=, $pop42, $pop115
-; NO-SIMD128-NEXT: i32.store8 0($pop39), $pop43
-; NO-SIMD128-NEXT: i32.add $push44=, $9, $25
-; NO-SIMD128-NEXT: i32.const $push114=, 1
-; NO-SIMD128-NEXT: i32.add $push45=, $pop44, $pop114
-; NO-SIMD128-NEXT: i32.const $push113=, 254
-; NO-SIMD128-NEXT: i32.and $push46=, $pop45, $pop113
-; NO-SIMD128-NEXT: i32.const $push112=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push47=, $pop46, $pop112
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop47
-; NO-SIMD128-NEXT: i32.const $push48=, 7
-; NO-SIMD128-NEXT: i32.add $push49=, $0, $pop48
-; NO-SIMD128-NEXT: i32.add $push50=, $8, $24
+; NO-SIMD128-NEXT: i32.add $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.const $push1=, 1
+; NO-SIMD128-NEXT: i32.add $push2=, $pop0, $pop1
+; NO-SIMD128-NEXT: i32.const $push3=, 254
+; NO-SIMD128-NEXT: i32.and $push4=, $pop2, $pop3
; NO-SIMD128-NEXT: i32.const $push111=, 1
-; NO-SIMD128-NEXT: i32.add $push51=, $pop50, $pop111
-; NO-SIMD128-NEXT: i32.const $push110=, 254
-; NO-SIMD128-NEXT: i32.and $push52=, $pop51, $pop110
-; NO-SIMD128-NEXT: i32.const $push109=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop109
-; NO-SIMD128-NEXT: i32.store8 0($pop49), $pop53
-; NO-SIMD128-NEXT: i32.const $push54=, 6
-; NO-SIMD128-NEXT: i32.add $push55=, $0, $pop54
-; NO-SIMD128-NEXT: i32.add $push56=, $7, $23
+; NO-SIMD128-NEXT: i32.shr_u $push5=, $pop4, $pop111
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $15, $31
+; NO-SIMD128-NEXT: i32.const $push110=, 1
+; NO-SIMD128-NEXT: i32.add $push7=, $pop6, $pop110
+; NO-SIMD128-NEXT: i32.const $push109=, 254
+; NO-SIMD128-NEXT: i32.and $push8=, $pop7, $pop109
; NO-SIMD128-NEXT: i32.const $push108=, 1
-; NO-SIMD128-NEXT: i32.add $push57=, $pop56, $pop108
-; NO-SIMD128-NEXT: i32.const $push107=, 254
-; NO-SIMD128-NEXT: i32.and $push58=, $pop57, $pop107
-; NO-SIMD128-NEXT: i32.const $push106=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push59=, $pop58, $pop106
-; NO-SIMD128-NEXT: i32.store8 0($pop55), $pop59
-; NO-SIMD128-NEXT: i32.const $push60=, 5
-; NO-SIMD128-NEXT: i32.add $push61=, $0, $pop60
-; NO-SIMD128-NEXT: i32.add $push62=, $6, $22
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop108
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop9
+; NO-SIMD128-NEXT: i32.add $push10=, $14, $30
+; NO-SIMD128-NEXT: i32.const $push107=, 1
+; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop107
+; NO-SIMD128-NEXT: i32.const $push106=, 254
+; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop106
; NO-SIMD128-NEXT: i32.const $push105=, 1
-; NO-SIMD128-NEXT: i32.add $push63=, $pop62, $pop105
-; NO-SIMD128-NEXT: i32.const $push104=, 254
-; NO-SIMD128-NEXT: i32.and $push64=, $pop63, $pop104
-; NO-SIMD128-NEXT: i32.const $push103=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push65=, $pop64, $pop103
-; NO-SIMD128-NEXT: i32.store8 0($pop61), $pop65
-; NO-SIMD128-NEXT: i32.add $push66=, $5, $21
+; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop105
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-NEXT: i32.add $push14=, $13, $29
+; NO-SIMD128-NEXT: i32.const $push104=, 1
+; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop104
+; NO-SIMD128-NEXT: i32.const $push103=, 254
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $pop103
; NO-SIMD128-NEXT: i32.const $push102=, 1
-; NO-SIMD128-NEXT: i32.add $push67=, $pop66, $pop102
-; NO-SIMD128-NEXT: i32.const $push101=, 254
-; NO-SIMD128-NEXT: i32.and $push68=, $pop67, $pop101
-; NO-SIMD128-NEXT: i32.const $push100=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push69=, $pop68, $pop100
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop69
-; NO-SIMD128-NEXT: i32.const $push70=, 3
-; NO-SIMD128-NEXT: i32.add $push71=, $0, $pop70
-; NO-SIMD128-NEXT: i32.add $push72=, $4, $20
+; NO-SIMD128-NEXT: i32.shr_u $push17=, $pop16, $pop102
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop17
+; NO-SIMD128-NEXT: i32.add $push18=, $12, $28
+; NO-SIMD128-NEXT: i32.const $push101=, 1
+; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop101
+; NO-SIMD128-NEXT: i32.const $push100=, 254
+; NO-SIMD128-NEXT: i32.and $push20=, $pop19, $pop100
; NO-SIMD128-NEXT: i32.const $push99=, 1
-; NO-SIMD128-NEXT: i32.add $push73=, $pop72, $pop99
-; NO-SIMD128-NEXT: i32.const $push98=, 254
-; NO-SIMD128-NEXT: i32.and $push74=, $pop73, $pop98
-; NO-SIMD128-NEXT: i32.const $push97=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push75=, $pop74, $pop97
-; NO-SIMD128-NEXT: i32.store8 0($pop71), $pop75
-; NO-SIMD128-NEXT: i32.add $push76=, $3, $19
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop99
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop21
+; NO-SIMD128-NEXT: i32.add $push22=, $11, $27
+; NO-SIMD128-NEXT: i32.const $push98=, 1
+; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop98
+; NO-SIMD128-NEXT: i32.const $push97=, 254
+; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop97
; NO-SIMD128-NEXT: i32.const $push96=, 1
-; NO-SIMD128-NEXT: i32.add $push77=, $pop76, $pop96
-; NO-SIMD128-NEXT: i32.const $push95=, 254
-; NO-SIMD128-NEXT: i32.and $push78=, $pop77, $pop95
-; NO-SIMD128-NEXT: i32.const $push94=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push79=, $pop78, $pop94
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop79
-; NO-SIMD128-NEXT: i32.add $push80=, $2, $18
+; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop96
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop25
+; NO-SIMD128-NEXT: i32.add $push26=, $10, $26
+; NO-SIMD128-NEXT: i32.const $push95=, 1
+; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop95
+; NO-SIMD128-NEXT: i32.const $push94=, 254
+; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop94
; NO-SIMD128-NEXT: i32.const $push93=, 1
-; NO-SIMD128-NEXT: i32.add $push81=, $pop80, $pop93
-; NO-SIMD128-NEXT: i32.const $push92=, 254
-; NO-SIMD128-NEXT: i32.and $push82=, $pop81, $pop92
-; NO-SIMD128-NEXT: i32.const $push91=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push83=, $pop82, $pop91
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop83
-; NO-SIMD128-NEXT: i32.add $push84=, $1, $17
+; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop93
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop29
+; NO-SIMD128-NEXT: i32.add $push30=, $9, $25
+; NO-SIMD128-NEXT: i32.const $push92=, 1
+; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop92
+; NO-SIMD128-NEXT: i32.const $push91=, 254
+; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop91
; NO-SIMD128-NEXT: i32.const $push90=, 1
-; NO-SIMD128-NEXT: i32.add $push85=, $pop84, $pop90
-; NO-SIMD128-NEXT: i32.const $push89=, 254
-; NO-SIMD128-NEXT: i32.and $push86=, $pop85, $pop89
-; NO-SIMD128-NEXT: i32.const $push88=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push87=, $pop86, $pop88
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop87
+; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop90
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop33
+; NO-SIMD128-NEXT: i32.add $push34=, $8, $24
+; NO-SIMD128-NEXT: i32.const $push89=, 1
+; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop89
+; NO-SIMD128-NEXT: i32.const $push88=, 254
+; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop88
+; NO-SIMD128-NEXT: i32.const $push87=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop87
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop37
+; NO-SIMD128-NEXT: i32.add $push38=, $7, $23
+; NO-SIMD128-NEXT: i32.const $push86=, 1
+; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop86
+; NO-SIMD128-NEXT: i32.const $push85=, 254
+; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $pop85
+; NO-SIMD128-NEXT: i32.const $push84=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop84
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop41
+; NO-SIMD128-NEXT: i32.add $push42=, $6, $22
+; NO-SIMD128-NEXT: i32.const $push83=, 1
+; NO-SIMD128-NEXT: i32.add $push43=, $pop42, $pop83
+; NO-SIMD128-NEXT: i32.const $push82=, 254
+; NO-SIMD128-NEXT: i32.and $push44=, $pop43, $pop82
+; NO-SIMD128-NEXT: i32.const $push81=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push45=, $pop44, $pop81
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop45
+; NO-SIMD128-NEXT: i32.add $push46=, $5, $21
+; NO-SIMD128-NEXT: i32.const $push80=, 1
+; NO-SIMD128-NEXT: i32.add $push47=, $pop46, $pop80
+; NO-SIMD128-NEXT: i32.const $push79=, 254
+; NO-SIMD128-NEXT: i32.and $push48=, $pop47, $pop79
+; NO-SIMD128-NEXT: i32.const $push78=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push49=, $pop48, $pop78
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop49
+; NO-SIMD128-NEXT: i32.add $push50=, $4, $20
+; NO-SIMD128-NEXT: i32.const $push77=, 1
+; NO-SIMD128-NEXT: i32.add $push51=, $pop50, $pop77
+; NO-SIMD128-NEXT: i32.const $push76=, 254
+; NO-SIMD128-NEXT: i32.and $push52=, $pop51, $pop76
+; NO-SIMD128-NEXT: i32.const $push75=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop75
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop53
+; NO-SIMD128-NEXT: i32.add $push54=, $3, $19
+; NO-SIMD128-NEXT: i32.const $push74=, 1
+; NO-SIMD128-NEXT: i32.add $push55=, $pop54, $pop74
+; NO-SIMD128-NEXT: i32.const $push73=, 254
+; NO-SIMD128-NEXT: i32.and $push56=, $pop55, $pop73
+; NO-SIMD128-NEXT: i32.const $push72=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push57=, $pop56, $pop72
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop57
+; NO-SIMD128-NEXT: i32.add $push58=, $2, $18
+; NO-SIMD128-NEXT: i32.const $push71=, 1
+; NO-SIMD128-NEXT: i32.add $push59=, $pop58, $pop71
+; NO-SIMD128-NEXT: i32.const $push70=, 254
+; NO-SIMD128-NEXT: i32.and $push60=, $pop59, $pop70
+; NO-SIMD128-NEXT: i32.const $push69=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push61=, $pop60, $pop69
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop61
+; NO-SIMD128-NEXT: i32.add $push62=, $1, $17
+; NO-SIMD128-NEXT: i32.const $push68=, 1
+; NO-SIMD128-NEXT: i32.add $push63=, $pop62, $pop68
+; NO-SIMD128-NEXT: i32.const $push67=, 254
+; NO-SIMD128-NEXT: i32.and $push64=, $pop63, $pop67
+; NO-SIMD128-NEXT: i32.const $push66=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push65=, $pop64, $pop66
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop65
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: avgr_u_v16i8_wrap:
@@ -2109,151 +1735,129 @@ define <16 x i8> @avgr_u_v16i8_wrap(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $pop0, $pop1
; NO-SIMD128-FAST-NEXT: i32.const $push3=, 254
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop2, $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push133=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop133
+; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop111
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop5
; NO-SIMD128-FAST-NEXT: i32.add $push6=, $2, $18
-; NO-SIMD128-FAST-NEXT: i32.const $push132=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop132
-; NO-SIMD128-FAST-NEXT: i32.const $push131=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop131
-; NO-SIMD128-FAST-NEXT: i32.const $push130=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop130
+; NO-SIMD128-FAST-NEXT: i32.const $push110=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop110
+; NO-SIMD128-FAST-NEXT: i32.const $push109=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop109
+; NO-SIMD128-FAST-NEXT: i32.const $push108=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop108
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.add $push10=, $3, $19
-; NO-SIMD128-FAST-NEXT: i32.const $push129=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop129
-; NO-SIMD128-FAST-NEXT: i32.const $push128=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop128
-; NO-SIMD128-FAST-NEXT: i32.const $push127=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop127
-; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.const $push126=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $pop16, $pop126
-; NO-SIMD128-FAST-NEXT: i32.const $push125=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $pop17, $pop125
-; NO-SIMD128-FAST-NEXT: i32.const $push124=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push19=, $pop18, $pop124
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop19
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.const $push123=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $pop20, $pop123
-; NO-SIMD128-FAST-NEXT: i32.const $push122=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $pop122
-; NO-SIMD128-FAST-NEXT: i32.const $push121=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push23=, $pop22, $pop121
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.add $push26=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.const $push120=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop120
-; NO-SIMD128-FAST-NEXT: i32.const $push119=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop119
-; NO-SIMD128-FAST-NEXT: i32.const $push118=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop118
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.const $push117=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $pop32, $pop117
-; NO-SIMD128-FAST-NEXT: i32.const $push116=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $pop116
-; NO-SIMD128-FAST-NEXT: i32.const $push115=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop115
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.add $push38=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.const $push114=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop114
-; NO-SIMD128-FAST-NEXT: i32.const $push113=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop113
-; NO-SIMD128-FAST-NEXT: i32.const $push112=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop112
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop41
-; NO-SIMD128-FAST-NEXT: i32.add $push42=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push43=, $pop42, $pop111
-; NO-SIMD128-FAST-NEXT: i32.const $push110=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push44=, $pop43, $pop110
-; NO-SIMD128-FAST-NEXT: i32.const $push109=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop109
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop45
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push47=, $0, $pop46
-; NO-SIMD128-FAST-NEXT: i32.add $push48=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.const $push108=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push49=, $pop48, $pop108
-; NO-SIMD128-FAST-NEXT: i32.const $push107=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push50=, $pop49, $pop107
-; NO-SIMD128-FAST-NEXT: i32.const $push106=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push51=, $pop50, $pop106
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop47), $pop51
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push53=, $0, $pop52
-; NO-SIMD128-FAST-NEXT: i32.add $push54=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.const $push107=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop107
+; NO-SIMD128-FAST-NEXT: i32.const $push106=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop106
; NO-SIMD128-FAST-NEXT: i32.const $push105=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop105
-; NO-SIMD128-FAST-NEXT: i32.const $push104=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push56=, $pop55, $pop104
-; NO-SIMD128-FAST-NEXT: i32.const $push103=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push57=, $pop56, $pop103
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop53), $pop57
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push59=, $0, $pop58
-; NO-SIMD128-FAST-NEXT: i32.add $push60=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop105
+; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.add $push14=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.const $push104=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop104
+; NO-SIMD128-FAST-NEXT: i32.const $push103=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $pop103
; NO-SIMD128-FAST-NEXT: i32.const $push102=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push61=, $pop60, $pop102
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push62=, $pop61, $pop101
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push63=, $pop62, $pop100
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop59), $pop63
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push65=, $0, $pop64
-; NO-SIMD128-FAST-NEXT: i32.add $push66=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop102
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.add $push18=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.const $push101=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop101
+; NO-SIMD128-FAST-NEXT: i32.const $push100=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $pop100
; NO-SIMD128-FAST-NEXT: i32.const $push99=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push67=, $pop66, $pop99
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push68=, $pop67, $pop98
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push69=, $pop68, $pop97
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop65), $pop69
-; NO-SIMD128-FAST-NEXT: i32.const $push70=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push71=, $0, $pop70
-; NO-SIMD128-FAST-NEXT: i32.add $push72=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop99
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.add $push22=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.const $push98=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push23=, $pop22, $pop98
+; NO-SIMD128-FAST-NEXT: i32.const $push97=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push24=, $pop23, $pop97
; NO-SIMD128-FAST-NEXT: i32.const $push96=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push73=, $pop72, $pop96
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push74=, $pop73, $pop95
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push75=, $pop74, $pop94
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop71), $pop75
-; NO-SIMD128-FAST-NEXT: i32.const $push76=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push77=, $0, $pop76
-; NO-SIMD128-FAST-NEXT: i32.add $push78=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop96
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop25
+; NO-SIMD128-FAST-NEXT: i32.add $push26=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.const $push95=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop95
+; NO-SIMD128-FAST-NEXT: i32.const $push94=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop94
; NO-SIMD128-FAST-NEXT: i32.const $push93=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push79=, $pop78, $pop93
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push80=, $pop79, $pop92
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push81=, $pop80, $pop91
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop77), $pop81
-; NO-SIMD128-FAST-NEXT: i32.const $push82=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push83=, $0, $pop82
-; NO-SIMD128-FAST-NEXT: i32.add $push84=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop93
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop29
+; NO-SIMD128-FAST-NEXT: i32.add $push30=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.const $push92=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push31=, $pop30, $pop92
+; NO-SIMD128-FAST-NEXT: i32.const $push91=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $pop31, $pop91
; NO-SIMD128-FAST-NEXT: i32.const $push90=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push85=, $pop84, $pop90
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push86=, $pop85, $pop89
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push87=, $pop86, $pop88
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop83), $pop87
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop90
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop33
+; NO-SIMD128-FAST-NEXT: i32.add $push34=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.const $push89=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push35=, $pop34, $pop89
+; NO-SIMD128-FAST-NEXT: i32.const $push88=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push36=, $pop35, $pop88
+; NO-SIMD128-FAST-NEXT: i32.const $push87=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push37=, $pop36, $pop87
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop37
+; NO-SIMD128-FAST-NEXT: i32.add $push38=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.const $push86=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop86
+; NO-SIMD128-FAST-NEXT: i32.const $push85=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop85
+; NO-SIMD128-FAST-NEXT: i32.const $push84=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop84
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop41
+; NO-SIMD128-FAST-NEXT: i32.add $push42=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.const $push83=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push43=, $pop42, $pop83
+; NO-SIMD128-FAST-NEXT: i32.const $push82=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push44=, $pop43, $pop82
+; NO-SIMD128-FAST-NEXT: i32.const $push81=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop81
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop45
+; NO-SIMD128-FAST-NEXT: i32.add $push46=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.const $push80=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push47=, $pop46, $pop80
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push48=, $pop47, $pop79
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push49=, $pop48, $pop78
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop49
+; NO-SIMD128-FAST-NEXT: i32.add $push50=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push51=, $pop50, $pop77
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push52=, $pop51, $pop76
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push53=, $pop52, $pop75
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop53
+; NO-SIMD128-FAST-NEXT: i32.add $push54=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop74
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push56=, $pop55, $pop73
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push57=, $pop56, $pop72
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop57
+; NO-SIMD128-FAST-NEXT: i32.add $push58=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push59=, $pop58, $pop71
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push60=, $pop59, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push61=, $pop60, $pop69
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop61
+; NO-SIMD128-FAST-NEXT: i32.add $push62=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push63=, $pop62, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push64=, $pop63, $pop67
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push65=, $pop64, $pop66
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop65
; NO-SIMD128-FAST-NEXT: return
%a = add <16 x i8> %x, %y
%b = add <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
@@ -2279,140 +1883,118 @@ define <16 x i8> @abs_v16i8(<16 x i8> %x) {
; NO-SIMD128-LABEL: abs_v16i8:
; NO-SIMD128: .functype abs_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 15
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend8_s $push0=, $16
; NO-SIMD128-NEXT: i32.const $push1=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push117=, $pop0, $pop1
-; NO-SIMD128-NEXT: local.tee $push116=, $17=, $pop117
-; NO-SIMD128-NEXT: i32.xor $push2=, $16, $pop116
+; NO-SIMD128-NEXT: i32.shr_s $push95=, $pop0, $pop1
+; NO-SIMD128-NEXT: local.tee $push94=, $17=, $pop95
+; NO-SIMD128-NEXT: i32.xor $push2=, $16, $pop94
; NO-SIMD128-NEXT: i32.sub $push3=, $pop2, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.extend8_s $push6=, $15
-; NO-SIMD128-NEXT: i32.const $push115=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push114=, $pop6, $pop115
-; NO-SIMD128-NEXT: local.tee $push113=, $16=, $pop114
-; NO-SIMD128-NEXT: i32.xor $push7=, $15, $pop113
-; NO-SIMD128-NEXT: i32.sub $push8=, $pop7, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push14=, 13
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.extend8_s $push11=, $14
-; NO-SIMD128-NEXT: i32.const $push112=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push111=, $pop11, $pop112
-; NO-SIMD128-NEXT: local.tee $push110=, $16=, $pop111
-; NO-SIMD128-NEXT: i32.xor $push12=, $14, $pop110
-; NO-SIMD128-NEXT: i32.sub $push13=, $pop12, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $pop13
-; NO-SIMD128-NEXT: i32.const $push19=, 12
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.extend8_s $push16=, $13
-; NO-SIMD128-NEXT: i32.const $push109=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push108=, $pop16, $pop109
-; NO-SIMD128-NEXT: local.tee $push107=, $16=, $pop108
-; NO-SIMD128-NEXT: i32.xor $push17=, $13, $pop107
-; NO-SIMD128-NEXT: i32.sub $push18=, $pop17, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push24=, 11
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.extend8_s $push21=, $12
-; NO-SIMD128-NEXT: i32.const $push106=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push105=, $pop21, $pop106
-; NO-SIMD128-NEXT: local.tee $push104=, $16=, $pop105
-; NO-SIMD128-NEXT: i32.xor $push22=, $12, $pop104
-; NO-SIMD128-NEXT: i32.sub $push23=, $pop22, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push29=, 10
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.extend8_s $push26=, $11
-; NO-SIMD128-NEXT: i32.const $push103=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push102=, $pop26, $pop103
-; NO-SIMD128-NEXT: local.tee $push101=, $16=, $pop102
-; NO-SIMD128-NEXT: i32.xor $push27=, $11, $pop101
-; NO-SIMD128-NEXT: i32.sub $push28=, $pop27, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push34=, 9
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.extend8_s $push31=, $10
-; NO-SIMD128-NEXT: i32.const $push100=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push99=, $pop31, $pop100
-; NO-SIMD128-NEXT: local.tee $push98=, $16=, $pop99
-; NO-SIMD128-NEXT: i32.xor $push32=, $10, $pop98
-; NO-SIMD128-NEXT: i32.sub $push33=, $pop32, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.extend8_s $push36=, $9
-; NO-SIMD128-NEXT: i32.const $push97=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push96=, $pop36, $pop97
-; NO-SIMD128-NEXT: local.tee $push95=, $16=, $pop96
-; NO-SIMD128-NEXT: i32.xor $push37=, $9, $pop95
-; NO-SIMD128-NEXT: i32.sub $push38=, $pop37, $16
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop38
-; NO-SIMD128-NEXT: i32.const $push94=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop94
-; NO-SIMD128-NEXT: i32.extend8_s $push39=, $8
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop3
+; NO-SIMD128-NEXT: i32.extend8_s $push4=, $15
; NO-SIMD128-NEXT: i32.const $push93=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push92=, $pop39, $pop93
+; NO-SIMD128-NEXT: i32.shr_s $push92=, $pop4, $pop93
; NO-SIMD128-NEXT: local.tee $push91=, $16=, $pop92
-; NO-SIMD128-NEXT: i32.xor $push40=, $8, $pop91
-; NO-SIMD128-NEXT: i32.sub $push41=, $pop40, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop41
-; NO-SIMD128-NEXT: i32.const $push46=, 6
-; NO-SIMD128-NEXT: i32.add $push47=, $0, $pop46
-; NO-SIMD128-NEXT: i32.extend8_s $push43=, $7
+; NO-SIMD128-NEXT: i32.xor $push5=, $15, $pop91
+; NO-SIMD128-NEXT: i32.sub $push6=, $pop5, $16
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop6
+; NO-SIMD128-NEXT: i32.extend8_s $push7=, $14
; NO-SIMD128-NEXT: i32.const $push90=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push89=, $pop43, $pop90
+; NO-SIMD128-NEXT: i32.shr_s $push89=, $pop7, $pop90
; NO-SIMD128-NEXT: local.tee $push88=, $16=, $pop89
-; NO-SIMD128-NEXT: i32.xor $push44=, $7, $pop88
-; NO-SIMD128-NEXT: i32.sub $push45=, $pop44, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop47), $pop45
-; NO-SIMD128-NEXT: i32.const $push51=, 5
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.extend8_s $push48=, $6
+; NO-SIMD128-NEXT: i32.xor $push8=, $14, $pop88
+; NO-SIMD128-NEXT: i32.sub $push9=, $pop8, $16
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop9
+; NO-SIMD128-NEXT: i32.extend8_s $push10=, $13
; NO-SIMD128-NEXT: i32.const $push87=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push86=, $pop48, $pop87
+; NO-SIMD128-NEXT: i32.shr_s $push86=, $pop10, $pop87
; NO-SIMD128-NEXT: local.tee $push85=, $16=, $pop86
-; NO-SIMD128-NEXT: i32.xor $push49=, $6, $pop85
-; NO-SIMD128-NEXT: i32.sub $push50=, $pop49, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.extend8_s $push53=, $5
+; NO-SIMD128-NEXT: i32.xor $push11=, $13, $pop85
+; NO-SIMD128-NEXT: i32.sub $push12=, $pop11, $16
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $12
; NO-SIMD128-NEXT: i32.const $push84=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push83=, $pop53, $pop84
+; NO-SIMD128-NEXT: i32.shr_s $push83=, $pop13, $pop84
; NO-SIMD128-NEXT: local.tee $push82=, $16=, $pop83
-; NO-SIMD128-NEXT: i32.xor $push54=, $5, $pop82
-; NO-SIMD128-NEXT: i32.sub $push55=, $pop54, $16
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop55
-; NO-SIMD128-NEXT: i32.const $push59=, 3
-; NO-SIMD128-NEXT: i32.add $push60=, $0, $pop59
-; NO-SIMD128-NEXT: i32.extend8_s $push56=, $4
+; NO-SIMD128-NEXT: i32.xor $push14=, $12, $pop82
+; NO-SIMD128-NEXT: i32.sub $push15=, $pop14, $16
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop15
+; NO-SIMD128-NEXT: i32.extend8_s $push16=, $11
; NO-SIMD128-NEXT: i32.const $push81=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push80=, $pop56, $pop81
+; NO-SIMD128-NEXT: i32.shr_s $push80=, $pop16, $pop81
; NO-SIMD128-NEXT: local.tee $push79=, $16=, $pop80
-; NO-SIMD128-NEXT: i32.xor $push57=, $4, $pop79
-; NO-SIMD128-NEXT: i32.sub $push58=, $pop57, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop60), $pop58
-; NO-SIMD128-NEXT: i32.extend8_s $push61=, $3
+; NO-SIMD128-NEXT: i32.xor $push17=, $11, $pop79
+; NO-SIMD128-NEXT: i32.sub $push18=, $pop17, $16
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop18
+; NO-SIMD128-NEXT: i32.extend8_s $push19=, $10
; NO-SIMD128-NEXT: i32.const $push78=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push77=, $pop61, $pop78
+; NO-SIMD128-NEXT: i32.shr_s $push77=, $pop19, $pop78
; NO-SIMD128-NEXT: local.tee $push76=, $16=, $pop77
-; NO-SIMD128-NEXT: i32.xor $push62=, $3, $pop76
-; NO-SIMD128-NEXT: i32.sub $push63=, $pop62, $16
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop63
-; NO-SIMD128-NEXT: i32.extend8_s $push64=, $2
+; NO-SIMD128-NEXT: i32.xor $push20=, $10, $pop76
+; NO-SIMD128-NEXT: i32.sub $push21=, $pop20, $16
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop21
+; NO-SIMD128-NEXT: i32.extend8_s $push22=, $9
; NO-SIMD128-NEXT: i32.const $push75=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push74=, $pop64, $pop75
+; NO-SIMD128-NEXT: i32.shr_s $push74=, $pop22, $pop75
; NO-SIMD128-NEXT: local.tee $push73=, $16=, $pop74
-; NO-SIMD128-NEXT: i32.xor $push65=, $2, $pop73
-; NO-SIMD128-NEXT: i32.sub $push66=, $pop65, $16
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop66
-; NO-SIMD128-NEXT: i32.extend8_s $push67=, $1
+; NO-SIMD128-NEXT: i32.xor $push23=, $9, $pop73
+; NO-SIMD128-NEXT: i32.sub $push24=, $pop23, $16
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop24
+; NO-SIMD128-NEXT: i32.extend8_s $push25=, $8
; NO-SIMD128-NEXT: i32.const $push72=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push71=, $pop67, $pop72
+; NO-SIMD128-NEXT: i32.shr_s $push71=, $pop25, $pop72
; NO-SIMD128-NEXT: local.tee $push70=, $16=, $pop71
-; NO-SIMD128-NEXT: i32.xor $push68=, $1, $pop70
-; NO-SIMD128-NEXT: i32.sub $push69=, $pop68, $16
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop69
+; NO-SIMD128-NEXT: i32.xor $push26=, $8, $pop70
+; NO-SIMD128-NEXT: i32.sub $push27=, $pop26, $16
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop27
+; NO-SIMD128-NEXT: i32.extend8_s $push28=, $7
+; NO-SIMD128-NEXT: i32.const $push69=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push68=, $pop28, $pop69
+; NO-SIMD128-NEXT: local.tee $push67=, $16=, $pop68
+; NO-SIMD128-NEXT: i32.xor $push29=, $7, $pop67
+; NO-SIMD128-NEXT: i32.sub $push30=, $pop29, $16
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop30
+; NO-SIMD128-NEXT: i32.extend8_s $push31=, $6
+; NO-SIMD128-NEXT: i32.const $push66=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push65=, $pop31, $pop66
+; NO-SIMD128-NEXT: local.tee $push64=, $16=, $pop65
+; NO-SIMD128-NEXT: i32.xor $push32=, $6, $pop64
+; NO-SIMD128-NEXT: i32.sub $push33=, $pop32, $16
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop33
+; NO-SIMD128-NEXT: i32.extend8_s $push34=, $5
+; NO-SIMD128-NEXT: i32.const $push63=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push62=, $pop34, $pop63
+; NO-SIMD128-NEXT: local.tee $push61=, $16=, $pop62
+; NO-SIMD128-NEXT: i32.xor $push35=, $5, $pop61
+; NO-SIMD128-NEXT: i32.sub $push36=, $pop35, $16
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop36
+; NO-SIMD128-NEXT: i32.extend8_s $push37=, $4
+; NO-SIMD128-NEXT: i32.const $push60=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push59=, $pop37, $pop60
+; NO-SIMD128-NEXT: local.tee $push58=, $16=, $pop59
+; NO-SIMD128-NEXT: i32.xor $push38=, $4, $pop58
+; NO-SIMD128-NEXT: i32.sub $push39=, $pop38, $16
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop39
+; NO-SIMD128-NEXT: i32.extend8_s $push40=, $3
+; NO-SIMD128-NEXT: i32.const $push57=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push56=, $pop40, $pop57
+; NO-SIMD128-NEXT: local.tee $push55=, $16=, $pop56
+; NO-SIMD128-NEXT: i32.xor $push41=, $3, $pop55
+; NO-SIMD128-NEXT: i32.sub $push42=, $pop41, $16
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop42
+; NO-SIMD128-NEXT: i32.extend8_s $push43=, $2
+; NO-SIMD128-NEXT: i32.const $push54=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push53=, $pop43, $pop54
+; NO-SIMD128-NEXT: local.tee $push52=, $16=, $pop53
+; NO-SIMD128-NEXT: i32.xor $push44=, $2, $pop52
+; NO-SIMD128-NEXT: i32.sub $push45=, $pop44, $16
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop45
+; NO-SIMD128-NEXT: i32.extend8_s $push46=, $1
+; NO-SIMD128-NEXT: i32.const $push51=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push50=, $pop46, $pop51
+; NO-SIMD128-NEXT: local.tee $push49=, $16=, $pop50
+; NO-SIMD128-NEXT: i32.xor $push47=, $1, $pop49
+; NO-SIMD128-NEXT: i32.sub $push48=, $pop47, $16
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop48
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: abs_v16i8:
@@ -2420,138 +2002,116 @@ define <16 x i8> @abs_v16i8(<16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push0=, $1
; NO-SIMD128-FAST-NEXT: i32.const $push1=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push117=, $pop0, $pop1
-; NO-SIMD128-FAST-NEXT: local.tee $push116=, $17=, $pop117
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop116
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push95=, $pop0, $pop1
+; NO-SIMD128-FAST-NEXT: local.tee $push94=, $17=, $pop95
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop94
; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop2, $17
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop3
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push4=, $2
-; NO-SIMD128-FAST-NEXT: i32.const $push115=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push114=, $pop4, $pop115
-; NO-SIMD128-FAST-NEXT: local.tee $push113=, $1=, $pop114
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop113
+; NO-SIMD128-FAST-NEXT: i32.const $push93=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push92=, $pop4, $pop93
+; NO-SIMD128-FAST-NEXT: local.tee $push91=, $1=, $pop92
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop91
; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push7=, $3
-; NO-SIMD128-FAST-NEXT: i32.const $push112=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push111=, $pop7, $pop112
-; NO-SIMD128-FAST-NEXT: local.tee $push110=, $2=, $pop111
-; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $3, $pop110
+; NO-SIMD128-FAST-NEXT: i32.const $push90=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push89=, $pop7, $pop90
+; NO-SIMD128-FAST-NEXT: local.tee $push88=, $2=, $pop89
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $3, $pop88
; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $pop8, $2
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push10=, $4
-; NO-SIMD128-FAST-NEXT: i32.const $push109=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push108=, $pop10, $pop109
-; NO-SIMD128-FAST-NEXT: local.tee $push107=, $3=, $pop108
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $4, $pop107
-; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $pop11, $3
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $5
-; NO-SIMD128-FAST-NEXT: i32.const $push106=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push105=, $pop15, $pop106
-; NO-SIMD128-FAST-NEXT: local.tee $push104=, $4=, $pop105
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $5, $pop104
-; NO-SIMD128-FAST-NEXT: i32.sub $push17=, $pop16, $4
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $6
-; NO-SIMD128-FAST-NEXT: i32.const $push103=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push102=, $pop18, $pop103
-; NO-SIMD128-FAST-NEXT: local.tee $push101=, $5=, $pop102
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $6, $pop101
-; NO-SIMD128-FAST-NEXT: i32.sub $push20=, $pop19, $5
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $7
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push99=, $pop23, $pop100
-; NO-SIMD128-FAST-NEXT: local.tee $push98=, $6=, $pop99
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $7, $pop98
-; NO-SIMD128-FAST-NEXT: i32.sub $push25=, $pop24, $6
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop97
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $8
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push95=, $pop28, $pop96
-; NO-SIMD128-FAST-NEXT: local.tee $push94=, $7=, $pop95
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $8, $pop94
-; NO-SIMD128-FAST-NEXT: i32.sub $push30=, $pop29, $7
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop30
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push32=, $9
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push92=, $pop32, $pop93
-; NO-SIMD128-FAST-NEXT: local.tee $push91=, $8=, $pop92
-; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $9, $pop91
-; NO-SIMD128-FAST-NEXT: i32.sub $push34=, $pop33, $8
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push35=, $10
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push89=, $pop35, $pop90
-; NO-SIMD128-FAST-NEXT: local.tee $push88=, $9=, $pop89
-; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $10, $pop88
-; NO-SIMD128-FAST-NEXT: i32.sub $push37=, $pop36, $9
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop39), $pop37
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push44=, $0, $pop43
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $11
; NO-SIMD128-FAST-NEXT: i32.const $push87=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push86=, $pop40, $pop87
-; NO-SIMD128-FAST-NEXT: local.tee $push85=, $10=, $pop86
-; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $11, $pop85
-; NO-SIMD128-FAST-NEXT: i32.sub $push42=, $pop41, $10
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop44), $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push49=, $0, $pop48
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $12
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push86=, $pop10, $pop87
+; NO-SIMD128-FAST-NEXT: local.tee $push85=, $3=, $pop86
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $4, $pop85
+; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $pop11, $3
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $5
; NO-SIMD128-FAST-NEXT: i32.const $push84=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push83=, $pop45, $pop84
-; NO-SIMD128-FAST-NEXT: local.tee $push82=, $11=, $pop83
-; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $12, $pop82
-; NO-SIMD128-FAST-NEXT: i32.sub $push47=, $pop46, $11
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop49), $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push50=, $13
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push83=, $pop13, $pop84
+; NO-SIMD128-FAST-NEXT: local.tee $push82=, $4=, $pop83
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $5, $pop82
+; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $pop14, $4
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $6
; NO-SIMD128-FAST-NEXT: i32.const $push81=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push80=, $pop50, $pop81
-; NO-SIMD128-FAST-NEXT: local.tee $push79=, $12=, $pop80
-; NO-SIMD128-FAST-NEXT: i32.xor $push51=, $13, $pop79
-; NO-SIMD128-FAST-NEXT: i32.sub $push52=, $pop51, $12
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop54), $pop52
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push59=, $0, $pop58
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push55=, $14
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push80=, $pop16, $pop81
+; NO-SIMD128-FAST-NEXT: local.tee $push79=, $5=, $pop80
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $6, $pop79
+; NO-SIMD128-FAST-NEXT: i32.sub $push18=, $pop17, $5
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $7
; NO-SIMD128-FAST-NEXT: i32.const $push78=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push77=, $pop55, $pop78
-; NO-SIMD128-FAST-NEXT: local.tee $push76=, $13=, $pop77
-; NO-SIMD128-FAST-NEXT: i32.xor $push56=, $14, $pop76
-; NO-SIMD128-FAST-NEXT: i32.sub $push57=, $pop56, $13
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop59), $pop57
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push64=, $0, $pop63
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push60=, $15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push77=, $pop19, $pop78
+; NO-SIMD128-FAST-NEXT: local.tee $push76=, $6=, $pop77
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $7, $pop76
+; NO-SIMD128-FAST-NEXT: i32.sub $push21=, $pop20, $6
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $8
; NO-SIMD128-FAST-NEXT: i32.const $push75=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push74=, $pop60, $pop75
-; NO-SIMD128-FAST-NEXT: local.tee $push73=, $14=, $pop74
-; NO-SIMD128-FAST-NEXT: i32.xor $push61=, $15, $pop73
-; NO-SIMD128-FAST-NEXT: i32.sub $push62=, $pop61, $14
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop64), $pop62
-; NO-SIMD128-FAST-NEXT: i32.const $push68=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push69=, $0, $pop68
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push65=, $16
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push74=, $pop22, $pop75
+; NO-SIMD128-FAST-NEXT: local.tee $push73=, $7=, $pop74
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $8, $pop73
+; NO-SIMD128-FAST-NEXT: i32.sub $push24=, $pop23, $7
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $9
; NO-SIMD128-FAST-NEXT: i32.const $push72=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push71=, $pop65, $pop72
-; NO-SIMD128-FAST-NEXT: local.tee $push70=, $0=, $pop71
-; NO-SIMD128-FAST-NEXT: i32.xor $push66=, $16, $pop70
-; NO-SIMD128-FAST-NEXT: i32.sub $push67=, $pop66, $0
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop69), $pop67
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push71=, $pop25, $pop72
+; NO-SIMD128-FAST-NEXT: local.tee $push70=, $8=, $pop71
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $9, $pop70
+; NO-SIMD128-FAST-NEXT: i32.sub $push27=, $pop26, $8
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $10
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push68=, $pop28, $pop69
+; NO-SIMD128-FAST-NEXT: local.tee $push67=, $9=, $pop68
+; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $10, $pop67
+; NO-SIMD128-FAST-NEXT: i32.sub $push30=, $pop29, $9
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push31=, $11
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push65=, $pop31, $pop66
+; NO-SIMD128-FAST-NEXT: local.tee $push64=, $10=, $pop65
+; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $11, $pop64
+; NO-SIMD128-FAST-NEXT: i32.sub $push33=, $pop32, $10
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop33
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push34=, $12
+; NO-SIMD128-FAST-NEXT: i32.const $push63=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push62=, $pop34, $pop63
+; NO-SIMD128-FAST-NEXT: local.tee $push61=, $11=, $pop62
+; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $12, $pop61
+; NO-SIMD128-FAST-NEXT: i32.sub $push36=, $pop35, $11
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push37=, $13
+; NO-SIMD128-FAST-NEXT: i32.const $push60=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push59=, $pop37, $pop60
+; NO-SIMD128-FAST-NEXT: local.tee $push58=, $12=, $pop59
+; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $13, $pop58
+; NO-SIMD128-FAST-NEXT: i32.sub $push39=, $pop38, $12
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop39
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $14
+; NO-SIMD128-FAST-NEXT: i32.const $push57=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push56=, $pop40, $pop57
+; NO-SIMD128-FAST-NEXT: local.tee $push55=, $13=, $pop56
+; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $14, $pop55
+; NO-SIMD128-FAST-NEXT: i32.sub $push42=, $pop41, $13
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop42
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push43=, $15
+; NO-SIMD128-FAST-NEXT: i32.const $push54=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push53=, $pop43, $pop54
+; NO-SIMD128-FAST-NEXT: local.tee $push52=, $14=, $pop53
+; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $15, $pop52
+; NO-SIMD128-FAST-NEXT: i32.sub $push45=, $pop44, $14
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop45
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push46=, $16
+; NO-SIMD128-FAST-NEXT: i32.const $push51=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push50=, $pop46, $pop51
+; NO-SIMD128-FAST-NEXT: local.tee $push49=, $15=, $pop50
+; NO-SIMD128-FAST-NEXT: i32.xor $push47=, $16, $pop49
+; NO-SIMD128-FAST-NEXT: i32.sub $push48=, $pop47, $15
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop48
; NO-SIMD128-FAST-NEXT: return
%a = sub <16 x i8> zeroinitializer, %x
%b = icmp slt <16 x i8> %x, zeroinitializer
@@ -2576,75 +2136,53 @@ define <16 x i8> @neg_v16i8(<16 x i8> %x) {
; NO-SIMD128: .functype neg_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 0
-; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $9
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push53=, 0
-; NO-SIMD128-NEXT: i32.sub $push2=, $pop53, $5
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push52=, 0
-; NO-SIMD128-NEXT: i32.sub $push3=, $pop52, $3
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push51=, 0
-; NO-SIMD128-NEXT: i32.sub $push4=, $pop51, $2
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push50=, 0
-; NO-SIMD128-NEXT: i32.sub $push5=, $pop50, $1
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push7=, 15
-; NO-SIMD128-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-NEXT: i32.const $push49=, 0
-; NO-SIMD128-NEXT: i32.sub $push6=, $pop49, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop8), $pop6
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.const $push48=, 0
-; NO-SIMD128-NEXT: i32.sub $push9=, $pop48, $15
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 13
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push47=, 0
-; NO-SIMD128-NEXT: i32.sub $push12=, $pop47, $14
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 12
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push46=, 0
-; NO-SIMD128-NEXT: i32.sub $push15=, $pop46, $13
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 11
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push45=, 0
-; NO-SIMD128-NEXT: i32.sub $push18=, $pop45, $12
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push22=, 10
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.const $push44=, 0
-; NO-SIMD128-NEXT: i32.sub $push21=, $pop44, $11
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push25=, 9
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push43=, 0
-; NO-SIMD128-NEXT: i32.sub $push24=, $pop43, $10
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push28=, 7
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.const $push42=, 0
-; NO-SIMD128-NEXT: i32.sub $push27=, $pop42, $8
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-NEXT: i32.const $push41=, 0
-; NO-SIMD128-NEXT: i32.sub $push30=, $pop41, $7
-; NO-SIMD128-NEXT: i32.store8 0($pop32), $pop30
-; NO-SIMD128-NEXT: i32.const $push34=, 5
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.const $push40=, 0
-; NO-SIMD128-NEXT: i32.sub $push33=, $pop40, $6
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push37=, 3
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.const $push39=, 0
-; NO-SIMD128-NEXT: i32.sub $push36=, $pop39, $4
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
+; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $16
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push31=, 0
+; NO-SIMD128-NEXT: i32.sub $push2=, $pop31, $15
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push30=, 0
+; NO-SIMD128-NEXT: i32.sub $push3=, $pop30, $14
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push29=, 0
+; NO-SIMD128-NEXT: i32.sub $push4=, $pop29, $13
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push28=, 0
+; NO-SIMD128-NEXT: i32.sub $push5=, $pop28, $12
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push27=, 0
+; NO-SIMD128-NEXT: i32.sub $push6=, $pop27, $11
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push26=, 0
+; NO-SIMD128-NEXT: i32.sub $push7=, $pop26, $10
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push25=, 0
+; NO-SIMD128-NEXT: i32.sub $push8=, $pop25, $9
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push24=, 0
+; NO-SIMD128-NEXT: i32.sub $push9=, $pop24, $8
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop9
+; NO-SIMD128-NEXT: i32.const $push23=, 0
+; NO-SIMD128-NEXT: i32.sub $push10=, $pop23, $7
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push22=, 0
+; NO-SIMD128-NEXT: i32.sub $push11=, $pop22, $6
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop11
+; NO-SIMD128-NEXT: i32.const $push21=, 0
+; NO-SIMD128-NEXT: i32.sub $push12=, $pop21, $5
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push20=, 0
+; NO-SIMD128-NEXT: i32.sub $push13=, $pop20, $4
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop13
+; NO-SIMD128-NEXT: i32.const $push19=, 0
+; NO-SIMD128-NEXT: i32.sub $push14=, $pop19, $3
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push18=, 0
+; NO-SIMD128-NEXT: i32.sub $push15=, $pop18, $2
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push17=, 0
+; NO-SIMD128-NEXT: i32.sub $push16=, $pop17, $1
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: neg_v16i8:
@@ -2653,73 +2191,51 @@ define <16 x i8> @neg_v16i8(<16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 0
; NO-SIMD128-FAST-NEXT: i32.sub $push1=, $pop0, $1
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop53, $2
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop31, $2
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop52, $3
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop30, $3
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop51, $4
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $pop50, $5
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push10=, $pop49, $6
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push13=, $pop48, $7
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push16=, $pop47, $8
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push17=, $pop46, $9
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push20=, $pop45, $10
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop19), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push23=, $pop44, $11
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push26=, $pop43, $12
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push29=, $pop42, $13
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push32=, $pop41, $14
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push35=, $pop40, $15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push38=, $pop39, $16
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $pop29, $4
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $pop28, $5
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop27, $6
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $pop26, $7
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $pop25, $8
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push24=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $pop24, $9
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push10=, $pop23, $10
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push11=, $pop22, $11
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $pop21, $12
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push13=, $pop20, $13
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push14=, $pop19, $14
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $pop18, $15
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push16=, $pop17, $16
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%a = sub <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>,
@@ -2744,124 +2260,80 @@ define <16 x i8> @shl_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128: .functype shl_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push40=, $17, $pop0
-; NO-SIMD128-NEXT: local.tee $push39=, $17=, $pop40
-; NO-SIMD128-NEXT: i32.shl $push1=, $9, $pop39
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop1
-; NO-SIMD128-NEXT: i32.shl $push2=, $5, $17
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop2
-; NO-SIMD128-NEXT: i32.shl $push3=, $3, $17
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-NEXT: i32.shl $push4=, $2, $17
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-NEXT: i32.shl $push5=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push7=, 15
-; NO-SIMD128-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-NEXT: i32.shl $push6=, $16, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop8), $pop6
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.shl $push9=, $15, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 13
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.shl $push12=, $14, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 12
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.shl $push15=, $13, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 11
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.shl $push18=, $12, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push22=, 10
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.shl $push21=, $11, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push25=, 9
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.shl $push24=, $10, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push28=, 7
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.shl $push27=, $8, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-NEXT: i32.shl $push30=, $7, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop32), $pop30
-; NO-SIMD128-NEXT: i32.const $push34=, 5
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.shl $push33=, $6, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push37=, 3
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.shl $push36=, $4, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
+; NO-SIMD128-NEXT: i32.and $push18=, $17, $pop0
+; NO-SIMD128-NEXT: local.tee $push17=, $17=, $pop18
+; NO-SIMD128-NEXT: i32.shl $push1=, $16, $pop17
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop1
+; NO-SIMD128-NEXT: i32.shl $push2=, $15, $17
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop2
+; NO-SIMD128-NEXT: i32.shl $push3=, $14, $17
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop3
+; NO-SIMD128-NEXT: i32.shl $push4=, $13, $17
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop4
+; NO-SIMD128-NEXT: i32.shl $push5=, $12, $17
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop5
+; NO-SIMD128-NEXT: i32.shl $push6=, $11, $17
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop6
+; NO-SIMD128-NEXT: i32.shl $push7=, $10, $17
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop7
+; NO-SIMD128-NEXT: i32.shl $push8=, $9, $17
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-NEXT: i32.shl $push9=, $8, $17
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop9
+; NO-SIMD128-NEXT: i32.shl $push10=, $7, $17
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop10
+; NO-SIMD128-NEXT: i32.shl $push11=, $6, $17
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop11
+; NO-SIMD128-NEXT: i32.shl $push12=, $5, $17
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop12
+; NO-SIMD128-NEXT: i32.shl $push13=, $4, $17
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop13
+; NO-SIMD128-NEXT: i32.shl $push14=, $3, $17
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop14
+; NO-SIMD128-NEXT: i32.shl $push15=, $2, $17
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop15
+; NO-SIMD128-NEXT: i32.shl $push16=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_v16i8:
; NO-SIMD128-FAST: .functype shl_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push40=, $17, $pop0
-; NO-SIMD128-FAST-NEXT: local.tee $push39=, $17=, $pop40
-; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $2, $pop39
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $17, $pop0
+; NO-SIMD128-FAST-NEXT: local.tee $push17=, $17=, $pop18
+; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $2, $pop17
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $1, $17
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2
; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $17
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $5, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $6, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $7, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop16
-; NO-SIMD128-FAST-NEXT: i32.shl $push17=, $9, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-FAST-NEXT: i32.shl $push20=, $10, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop19), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.shl $push23=, $11, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.shl $push26=, $12, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.shl $push29=, $13, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.shl $push32=, $14, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.shl $push35=, $15, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.shl $push38=, $16, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop38
+; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $5, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $6, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $7, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $8, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.shl $push9=, $9, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $10, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.shl $push11=, $11, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $12, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $13, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.shl $push14=, $14, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.shl $push15=, $15, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $16, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <16 x i8> undef, i8 %x, i32 0
%s = shufflevector <16 x i8> %t, <16 x i8> undef,
@@ -2890,75 +2362,53 @@ define <16 x i8> @shl_const_v16i8(<16 x i8> %v) {
; NO-SIMD128: .functype shl_const_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 5
-; NO-SIMD128-NEXT: i32.shl $push1=, $9, $pop0
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push53=, 5
-; NO-SIMD128-NEXT: i32.shl $push2=, $5, $pop53
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push52=, 5
-; NO-SIMD128-NEXT: i32.shl $push3=, $3, $pop52
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push51=, 5
-; NO-SIMD128-NEXT: i32.shl $push4=, $2, $pop51
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push50=, 5
-; NO-SIMD128-NEXT: i32.shl $push5=, $1, $pop50
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push7=, 15
-; NO-SIMD128-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-NEXT: i32.const $push49=, 5
-; NO-SIMD128-NEXT: i32.shl $push6=, $16, $pop49
-; NO-SIMD128-NEXT: i32.store8 0($pop8), $pop6
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.const $push48=, 5
-; NO-SIMD128-NEXT: i32.shl $push9=, $15, $pop48
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 13
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push47=, 5
-; NO-SIMD128-NEXT: i32.shl $push12=, $14, $pop47
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 12
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push46=, 5
-; NO-SIMD128-NEXT: i32.shl $push15=, $13, $pop46
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 11
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push45=, 5
-; NO-SIMD128-NEXT: i32.shl $push18=, $12, $pop45
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push22=, 10
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.const $push44=, 5
-; NO-SIMD128-NEXT: i32.shl $push21=, $11, $pop44
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push25=, 9
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push43=, 5
-; NO-SIMD128-NEXT: i32.shl $push24=, $10, $pop43
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push28=, 7
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.const $push42=, 5
-; NO-SIMD128-NEXT: i32.shl $push27=, $8, $pop42
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-NEXT: i32.const $push41=, 5
-; NO-SIMD128-NEXT: i32.shl $push30=, $7, $pop41
-; NO-SIMD128-NEXT: i32.store8 0($pop32), $pop30
-; NO-SIMD128-NEXT: i32.const $push40=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop40
-; NO-SIMD128-NEXT: i32.const $push39=, 5
-; NO-SIMD128-NEXT: i32.shl $push33=, $6, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop33
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.const $push38=, 5
-; NO-SIMD128-NEXT: i32.shl $push35=, $4, $pop38
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.shl $push1=, $16, $pop0
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push31=, 5
+; NO-SIMD128-NEXT: i32.shl $push2=, $15, $pop31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push30=, 5
+; NO-SIMD128-NEXT: i32.shl $push3=, $14, $pop30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push29=, 5
+; NO-SIMD128-NEXT: i32.shl $push4=, $13, $pop29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push28=, 5
+; NO-SIMD128-NEXT: i32.shl $push5=, $12, $pop28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push27=, 5
+; NO-SIMD128-NEXT: i32.shl $push6=, $11, $pop27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push26=, 5
+; NO-SIMD128-NEXT: i32.shl $push7=, $10, $pop26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push25=, 5
+; NO-SIMD128-NEXT: i32.shl $push8=, $9, $pop25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push24=, 5
+; NO-SIMD128-NEXT: i32.shl $push9=, $8, $pop24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop9
+; NO-SIMD128-NEXT: i32.const $push23=, 5
+; NO-SIMD128-NEXT: i32.shl $push10=, $7, $pop23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push22=, 5
+; NO-SIMD128-NEXT: i32.shl $push11=, $6, $pop22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop11
+; NO-SIMD128-NEXT: i32.const $push21=, 5
+; NO-SIMD128-NEXT: i32.shl $push12=, $5, $pop21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push20=, 5
+; NO-SIMD128-NEXT: i32.shl $push13=, $4, $pop20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop13
+; NO-SIMD128-NEXT: i32.const $push19=, 5
+; NO-SIMD128-NEXT: i32.shl $push14=, $3, $pop19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push18=, 5
+; NO-SIMD128-NEXT: i32.shl $push15=, $2, $pop18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push17=, 5
+; NO-SIMD128-NEXT: i32.shl $push16=, $1, $pop17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_const_v16i8:
@@ -2967,73 +2417,51 @@ define <16 x i8> @shl_const_v16i8(<16 x i8> %v) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 5
; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop31
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop52
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop30
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $pop51
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $5, $pop50
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push9=, $6, $pop48
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $7, $pop47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push15=, $8, $pop46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $9, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push19=, $10, $pop44
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push22=, $11, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push25=, $12, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push28=, $13, $pop41
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push31=, $14, $pop40
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push34=, $15, $pop39
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push37=, $16, $pop38
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $pop29
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $5, $pop28
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $6, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $7, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $8, $pop25
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push24=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push9=, $9, $pop24
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $10, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push11=, $11, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $12, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $13, $pop20
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push14=, $14, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push15=, $15, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $16, $pop17
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%a = shl <16 x i8> %v,
<i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5,
@@ -3248,91 +2676,69 @@ define <16 x i8> @shl_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128: .functype shl_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $25, $pop0
-; NO-SIMD128-NEXT: i32.shl $push2=, $9, $pop1
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push69=, 255
-; NO-SIMD128-NEXT: i32.and $push3=, $21, $pop69
-; NO-SIMD128-NEXT: i32.shl $push4=, $5, $pop3
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push68=, 255
-; NO-SIMD128-NEXT: i32.and $push5=, $19, $pop68
-; NO-SIMD128-NEXT: i32.shl $push6=, $3, $pop5
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push67=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $18, $pop67
-; NO-SIMD128-NEXT: i32.shl $push8=, $2, $pop7
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push66=, 255
-; NO-SIMD128-NEXT: i32.and $push9=, $17, $pop66
-; NO-SIMD128-NEXT: i32.shl $push10=, $1, $pop9
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop10
-; NO-SIMD128-NEXT: i32.const $push13=, 15
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push65=, 255
-; NO-SIMD128-NEXT: i32.and $push11=, $32, $pop65
-; NO-SIMD128-NEXT: i32.shl $push12=, $16, $pop11
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push17=, 14
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push64=, 255
-; NO-SIMD128-NEXT: i32.and $push15=, $31, $pop64
-; NO-SIMD128-NEXT: i32.shl $push16=, $15, $pop15
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push21=, 13
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.const $push63=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $30, $pop63
-; NO-SIMD128-NEXT: i32.shl $push20=, $14, $pop19
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push25=, 12
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push62=, 255
-; NO-SIMD128-NEXT: i32.and $push23=, $29, $pop62
-; NO-SIMD128-NEXT: i32.shl $push24=, $13, $pop23
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push61=, 255
-; NO-SIMD128-NEXT: i32.and $push27=, $28, $pop61
-; NO-SIMD128-NEXT: i32.shl $push28=, $12, $pop27
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push33=, 10
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.const $push60=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $27, $pop60
-; NO-SIMD128-NEXT: i32.shl $push32=, $11, $pop31
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push37=, 9
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.const $push59=, 255
-; NO-SIMD128-NEXT: i32.and $push35=, $26, $pop59
-; NO-SIMD128-NEXT: i32.shl $push36=, $10, $pop35
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
-; NO-SIMD128-NEXT: i32.const $push41=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.const $push58=, 255
-; NO-SIMD128-NEXT: i32.and $push39=, $24, $pop58
-; NO-SIMD128-NEXT: i32.shl $push40=, $8, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push45=, 6
-; NO-SIMD128-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-NEXT: i32.const $push57=, 255
-; NO-SIMD128-NEXT: i32.and $push43=, $23, $pop57
-; NO-SIMD128-NEXT: i32.shl $push44=, $7, $pop43
-; NO-SIMD128-NEXT: i32.store8 0($pop46), $pop44
-; NO-SIMD128-NEXT: i32.const $push49=, 5
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.const $push56=, 255
-; NO-SIMD128-NEXT: i32.and $push47=, $22, $pop56
-; NO-SIMD128-NEXT: i32.shl $push48=, $6, $pop47
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push53=, 3
-; NO-SIMD128-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-NEXT: i32.const $push55=, 255
-; NO-SIMD128-NEXT: i32.and $push51=, $20, $pop55
-; NO-SIMD128-NEXT: i32.shl $push52=, $4, $pop51
-; NO-SIMD128-NEXT: i32.store8 0($pop54), $pop52
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop0
+; NO-SIMD128-NEXT: i32.shl $push2=, $16, $pop1
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push47=, 255
+; NO-SIMD128-NEXT: i32.and $push3=, $31, $pop47
+; NO-SIMD128-NEXT: i32.shl $push4=, $15, $pop3
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push46=, 255
+; NO-SIMD128-NEXT: i32.and $push5=, $30, $pop46
+; NO-SIMD128-NEXT: i32.shl $push6=, $14, $pop5
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push45=, 255
+; NO-SIMD128-NEXT: i32.and $push7=, $29, $pop45
+; NO-SIMD128-NEXT: i32.shl $push8=, $13, $pop7
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push44=, 255
+; NO-SIMD128-NEXT: i32.and $push9=, $28, $pop44
+; NO-SIMD128-NEXT: i32.shl $push10=, $12, $pop9
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push43=, 255
+; NO-SIMD128-NEXT: i32.and $push11=, $27, $pop43
+; NO-SIMD128-NEXT: i32.shl $push12=, $11, $pop11
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push42=, 255
+; NO-SIMD128-NEXT: i32.and $push13=, $26, $pop42
+; NO-SIMD128-NEXT: i32.shl $push14=, $10, $pop13
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push41=, 255
+; NO-SIMD128-NEXT: i32.and $push15=, $25, $pop41
+; NO-SIMD128-NEXT: i32.shl $push16=, $9, $pop15
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop16
+; NO-SIMD128-NEXT: i32.const $push40=, 255
+; NO-SIMD128-NEXT: i32.and $push17=, $24, $pop40
+; NO-SIMD128-NEXT: i32.shl $push18=, $8, $pop17
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push39=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $23, $pop39
+; NO-SIMD128-NEXT: i32.shl $push20=, $7, $pop19
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push38=, 255
+; NO-SIMD128-NEXT: i32.and $push21=, $22, $pop38
+; NO-SIMD128-NEXT: i32.shl $push22=, $6, $pop21
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop22
+; NO-SIMD128-NEXT: i32.const $push37=, 255
+; NO-SIMD128-NEXT: i32.and $push23=, $21, $pop37
+; NO-SIMD128-NEXT: i32.shl $push24=, $5, $pop23
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push36=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $20, $pop36
+; NO-SIMD128-NEXT: i32.shl $push26=, $4, $pop25
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop26
+; NO-SIMD128-NEXT: i32.const $push35=, 255
+; NO-SIMD128-NEXT: i32.and $push27=, $19, $pop35
+; NO-SIMD128-NEXT: i32.shl $push28=, $3, $pop27
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push34=, 255
+; NO-SIMD128-NEXT: i32.and $push29=, $18, $pop34
+; NO-SIMD128-NEXT: i32.shl $push30=, $2, $pop29
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop30
+; NO-SIMD128-NEXT: i32.const $push33=, 255
+; NO-SIMD128-NEXT: i32.and $push31=, $17, $pop33
+; NO-SIMD128-NEXT: i32.shl $push32=, $1, $pop31
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_vec_v16i8:
@@ -3342,88 +2748,66 @@ define <16 x i8> @shl_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop0
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $1, $pop1
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push3=, $18, $pop69
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $18, $pop47
; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $2, $pop3
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $19, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $19, $pop46
; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $3, $pop5
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $20, $pop67
-; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $4, $pop9
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $21, $pop66
-; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $5, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $22, $pop65
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $6, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $23, $pop64
-; NO-SIMD128-FAST-NEXT: i32.shl $push20=, $7, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $24, $pop63
-; NO-SIMD128-FAST-NEXT: i32.shl $push24=, $8, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop24
-; NO-SIMD128-FAST-NEXT: i32.const $push62=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $25, $pop62
-; NO-SIMD128-FAST-NEXT: i32.shl $push26=, $9, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $26, $pop61
-; NO-SIMD128-FAST-NEXT: i32.shl $push30=, $10, $pop29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push33=, $27, $pop60
-; NO-SIMD128-FAST-NEXT: i32.shl $push34=, $11, $pop33
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop32), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $28, $pop59
-; NO-SIMD128-FAST-NEXT: i32.shl $push38=, $12, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push41=, $29, $pop58
-; NO-SIMD128-FAST-NEXT: i32.shl $push42=, $13, $pop41
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push44=, $0, $pop43
-; NO-SIMD128-FAST-NEXT: i32.const $push57=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push45=, $30, $pop57
-; NO-SIMD128-FAST-NEXT: i32.shl $push46=, $14, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop44), $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push48=, $0, $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push49=, $31, $pop56
-; NO-SIMD128-FAST-NEXT: i32.shl $push50=, $15, $pop49
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop48), $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push53=, $32, $pop55
-; NO-SIMD128-FAST-NEXT: i32.shl $push54=, $16, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop52), $pop54
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $20, $pop45
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $4, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $21, $pop44
+; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $5, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $22, $pop43
+; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $23, $pop42
+; NO-SIMD128-FAST-NEXT: i32.shl $push14=, $7, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push15=, $24, $pop41
+; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $25, $pop40
+; NO-SIMD128-FAST-NEXT: i32.shl $push18=, $9, $pop17
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.shl $push20=, $10, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $27, $pop38
+; NO-SIMD128-FAST-NEXT: i32.shl $push22=, $11, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop22
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $28, $pop37
+; NO-SIMD128-FAST-NEXT: i32.shl $push24=, $12, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $29, $pop36
+; NO-SIMD128-FAST-NEXT: i32.shl $push26=, $13, $pop25
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $30, $pop35
+; NO-SIMD128-FAST-NEXT: i32.shl $push28=, $14, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $31, $pop34
+; NO-SIMD128-FAST-NEXT: i32.shl $push30=, $15, $pop29
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $32, $pop33
+; NO-SIMD128-FAST-NEXT: i32.shl $push32=, $16, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%a = shl <16 x i8> %v, %x
ret <16 x i8> %a
@@ -3445,79 +2829,57 @@ define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-LABEL: shr_s_v16i8:
; NO-SIMD128: .functype shr_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend8_s $push1=, $9
+; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push56=, $17, $pop0
-; NO-SIMD128-NEXT: local.tee $push55=, $17=, $pop56
-; NO-SIMD128-NEXT: i32.shr_s $push2=, $pop1, $pop55
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend8_s $push3=, $5
+; NO-SIMD128-NEXT: i32.and $push34=, $17, $pop0
+; NO-SIMD128-NEXT: local.tee $push33=, $17=, $pop34
+; NO-SIMD128-NEXT: i32.shr_s $push2=, $pop1, $pop33
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT: i32.extend8_s $push3=, $15
; NO-SIMD128-NEXT: i32.shr_s $push4=, $pop3, $17
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop4
-; NO-SIMD128-NEXT: i32.extend8_s $push5=, $3
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop4
+; NO-SIMD128-NEXT: i32.extend8_s $push5=, $14
; NO-SIMD128-NEXT: i32.shr_s $push6=, $pop5, $17
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-NEXT: i32.extend8_s $push7=, $2
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop6
+; NO-SIMD128-NEXT: i32.extend8_s $push7=, $13
; NO-SIMD128-NEXT: i32.shr_s $push8=, $pop7, $17
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-NEXT: i32.extend8_s $push9=, $1
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop8
+; NO-SIMD128-NEXT: i32.extend8_s $push9=, $12
; NO-SIMD128-NEXT: i32.shr_s $push10=, $pop9, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop10
-; NO-SIMD128-NEXT: i32.const $push13=, 15
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.extend8_s $push11=, $16
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop10
+; NO-SIMD128-NEXT: i32.extend8_s $push11=, $11
; NO-SIMD128-NEXT: i32.shr_s $push12=, $pop11, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push17=, 14
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.extend8_s $push15=, $15
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop12
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $10
+; NO-SIMD128-NEXT: i32.shr_s $push14=, $pop13, $17
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop14
+; NO-SIMD128-NEXT: i32.extend8_s $push15=, $9
; NO-SIMD128-NEXT: i32.shr_s $push16=, $pop15, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push21=, 13
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.extend8_s $push19=, $14
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop16
+; NO-SIMD128-NEXT: i32.extend8_s $push17=, $8
+; NO-SIMD128-NEXT: i32.shr_s $push18=, $pop17, $17
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop18
+; NO-SIMD128-NEXT: i32.extend8_s $push19=, $7
; NO-SIMD128-NEXT: i32.shr_s $push20=, $pop19, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push25=, 12
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.extend8_s $push23=, $13
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop20
+; NO-SIMD128-NEXT: i32.extend8_s $push21=, $6
+; NO-SIMD128-NEXT: i32.shr_s $push22=, $pop21, $17
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop22
+; NO-SIMD128-NEXT: i32.extend8_s $push23=, $5
; NO-SIMD128-NEXT: i32.shr_s $push24=, $pop23, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.extend8_s $push27=, $12
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop24
+; NO-SIMD128-NEXT: i32.extend8_s $push25=, $4
+; NO-SIMD128-NEXT: i32.shr_s $push26=, $pop25, $17
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop26
+; NO-SIMD128-NEXT: i32.extend8_s $push27=, $3
; NO-SIMD128-NEXT: i32.shr_s $push28=, $pop27, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push33=, 10
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.extend8_s $push31=, $11
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop28
+; NO-SIMD128-NEXT: i32.extend8_s $push29=, $2
+; NO-SIMD128-NEXT: i32.shr_s $push30=, $pop29, $17
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop30
+; NO-SIMD128-NEXT: i32.extend8_s $push31=, $1
; NO-SIMD128-NEXT: i32.shr_s $push32=, $pop31, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push37=, 9
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.extend8_s $push35=, $10
-; NO-SIMD128-NEXT: i32.shr_s $push36=, $pop35, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
-; NO-SIMD128-NEXT: i32.const $push41=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.extend8_s $push39=, $8
-; NO-SIMD128-NEXT: i32.shr_s $push40=, $pop39, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push45=, 6
-; NO-SIMD128-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-NEXT: i32.extend8_s $push43=, $7
-; NO-SIMD128-NEXT: i32.shr_s $push44=, $pop43, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop46), $pop44
-; NO-SIMD128-NEXT: i32.const $push49=, 5
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.extend8_s $push47=, $6
-; NO-SIMD128-NEXT: i32.shr_s $push48=, $pop47, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push53=, 3
-; NO-SIMD128-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-NEXT: i32.extend8_s $push51=, $4
-; NO-SIMD128-NEXT: i32.shr_s $push52=, $pop51, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop54), $pop52
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_v16i8:
@@ -3525,9 +2887,9 @@ define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push1=, $1
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push56=, $17, $pop0
-; NO-SIMD128-FAST-NEXT: local.tee $push55=, $1=, $pop56
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $pop1, $pop55
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $17, $pop0
+; NO-SIMD128-FAST-NEXT: local.tee $push33=, $1=, $pop34
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $pop1, $pop33
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push3=, $2
; NO-SIMD128-FAST-NEXT: i32.shr_s $push4=, $pop3, $1
@@ -3535,67 +2897,45 @@ define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push5=, $3
; NO-SIMD128-FAST-NEXT: i32.shr_s $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push9=, $4
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push7=, $4
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push8=, $pop7, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push9=, $5
; NO-SIMD128-FAST-NEXT: i32.shr_s $push10=, $pop9, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $5
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $6
; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $pop11, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $6
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push14=, $pop13, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $8
; NO-SIMD128-FAST-NEXT: i32.shr_s $push16=, $pop15, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $7
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push17=, $9
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push18=, $pop17, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $10
; NO-SIMD128-FAST-NEXT: i32.shr_s $push20=, $pop19, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $8
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $11
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push22=, $pop21, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop22
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $12
; NO-SIMD128-FAST-NEXT: i32.shr_s $push24=, $pop23, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop24
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $9
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $13
; NO-SIMD128-FAST-NEXT: i32.shr_s $push26=, $pop25, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $10
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop26
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push27=, $14
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push28=, $pop27, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $15
; NO-SIMD128-FAST-NEXT: i32.shr_s $push30=, $pop29, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push33=, $11
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push34=, $pop33, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop32), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push37=, $12
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push38=, $pop37, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $13
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push42=, $pop41, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push44=, $0, $pop43
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $14
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push46=, $pop45, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop44), $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push48=, $0, $pop47
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push49=, $15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push50=, $pop49, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop48), $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push53=, $16
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push54=, $pop53, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop52), $pop54
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push31=, $16
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push32=, $pop31, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <16 x i8> undef, i8 %x, i32 0
%s = shufflevector <16 x i8> %t, <16 x i8> undef,
@@ -3811,108 +3151,86 @@ define <16 x i8> @shr_s_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128-LABEL: shr_s_vec_v16i8:
; NO-SIMD128: .functype shr_s_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend8_s $push2=, $9
+; NO-SIMD128-NEXT: i32.extend8_s $push2=, $16
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $25, $pop0
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop0
; NO-SIMD128-NEXT: i32.shr_s $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop3
-; NO-SIMD128-NEXT: i32.extend8_s $push5=, $5
-; NO-SIMD128-NEXT: i32.const $push85=, 255
-; NO-SIMD128-NEXT: i32.and $push4=, $21, $pop85
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop3
+; NO-SIMD128-NEXT: i32.extend8_s $push5=, $15
+; NO-SIMD128-NEXT: i32.const $push63=, 255
+; NO-SIMD128-NEXT: i32.and $push4=, $31, $pop63
; NO-SIMD128-NEXT: i32.shr_s $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-NEXT: i32.extend8_s $push8=, $3
-; NO-SIMD128-NEXT: i32.const $push84=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $19, $pop84
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop6
+; NO-SIMD128-NEXT: i32.extend8_s $push8=, $14
+; NO-SIMD128-NEXT: i32.const $push62=, 255
+; NO-SIMD128-NEXT: i32.and $push7=, $30, $pop62
; NO-SIMD128-NEXT: i32.shr_s $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop9
-; NO-SIMD128-NEXT: i32.extend8_s $push11=, $2
-; NO-SIMD128-NEXT: i32.const $push83=, 255
-; NO-SIMD128-NEXT: i32.and $push10=, $18, $pop83
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop9
+; NO-SIMD128-NEXT: i32.extend8_s $push11=, $13
+; NO-SIMD128-NEXT: i32.const $push61=, 255
+; NO-SIMD128-NEXT: i32.and $push10=, $29, $pop61
; NO-SIMD128-NEXT: i32.shr_s $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop12
-; NO-SIMD128-NEXT: i32.extend8_s $push14=, $1
-; NO-SIMD128-NEXT: i32.const $push82=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $17, $pop82
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-NEXT: i32.extend8_s $push14=, $12
+; NO-SIMD128-NEXT: i32.const $push60=, 255
+; NO-SIMD128-NEXT: i32.and $push13=, $28, $pop60
; NO-SIMD128-NEXT: i32.shr_s $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 15
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.extend8_s $push17=, $16
-; NO-SIMD128-NEXT: i32.const $push81=, 255
-; NO-SIMD128-NEXT: i32.and $push16=, $32, $pop81
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop15
+; NO-SIMD128-NEXT: i32.extend8_s $push17=, $11
+; NO-SIMD128-NEXT: i32.const $push59=, 255
+; NO-SIMD128-NEXT: i32.and $push16=, $27, $pop59
; NO-SIMD128-NEXT: i32.shr_s $push18=, $pop17, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push24=, 14
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.extend8_s $push22=, $15
-; NO-SIMD128-NEXT: i32.const $push80=, 255
-; NO-SIMD128-NEXT: i32.and $push21=, $31, $pop80
-; NO-SIMD128-NEXT: i32.shr_s $push23=, $pop22, $pop21
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push29=, 13
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.extend8_s $push27=, $14
-; NO-SIMD128-NEXT: i32.const $push79=, 255
-; NO-SIMD128-NEXT: i32.and $push26=, $30, $pop79
-; NO-SIMD128-NEXT: i32.shr_s $push28=, $pop27, $pop26
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push34=, 12
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.extend8_s $push32=, $13
-; NO-SIMD128-NEXT: i32.const $push78=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $29, $pop78
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop18
+; NO-SIMD128-NEXT: i32.extend8_s $push20=, $10
+; NO-SIMD128-NEXT: i32.const $push58=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $26, $pop58
+; NO-SIMD128-NEXT: i32.shr_s $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop21
+; NO-SIMD128-NEXT: i32.extend8_s $push23=, $9
+; NO-SIMD128-NEXT: i32.const $push57=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $25, $pop57
+; NO-SIMD128-NEXT: i32.shr_s $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop24
+; NO-SIMD128-NEXT: i32.extend8_s $push26=, $8
+; NO-SIMD128-NEXT: i32.const $push56=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $24, $pop56
+; NO-SIMD128-NEXT: i32.shr_s $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop27
+; NO-SIMD128-NEXT: i32.extend8_s $push29=, $7
+; NO-SIMD128-NEXT: i32.const $push55=, 255
+; NO-SIMD128-NEXT: i32.and $push28=, $23, $pop55
+; NO-SIMD128-NEXT: i32.shr_s $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop30
+; NO-SIMD128-NEXT: i32.extend8_s $push32=, $6
+; NO-SIMD128-NEXT: i32.const $push54=, 255
+; NO-SIMD128-NEXT: i32.and $push31=, $22, $pop54
; NO-SIMD128-NEXT: i32.shr_s $push33=, $pop32, $pop31
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push39=, 11
-; NO-SIMD128-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-NEXT: i32.extend8_s $push37=, $12
-; NO-SIMD128-NEXT: i32.const $push77=, 255
-; NO-SIMD128-NEXT: i32.and $push36=, $28, $pop77
-; NO-SIMD128-NEXT: i32.shr_s $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-NEXT: i32.const $push44=, 10
-; NO-SIMD128-NEXT: i32.add $push45=, $0, $pop44
-; NO-SIMD128-NEXT: i32.extend8_s $push42=, $11
-; NO-SIMD128-NEXT: i32.const $push76=, 255
-; NO-SIMD128-NEXT: i32.and $push41=, $27, $pop76
-; NO-SIMD128-NEXT: i32.shr_s $push43=, $pop42, $pop41
-; NO-SIMD128-NEXT: i32.store8 0($pop45), $pop43
-; NO-SIMD128-NEXT: i32.const $push49=, 9
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.extend8_s $push47=, $10
-; NO-SIMD128-NEXT: i32.const $push75=, 255
-; NO-SIMD128-NEXT: i32.and $push46=, $26, $pop75
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop33
+; NO-SIMD128-NEXT: i32.extend8_s $push35=, $5
+; NO-SIMD128-NEXT: i32.const $push53=, 255
+; NO-SIMD128-NEXT: i32.and $push34=, $21, $pop53
+; NO-SIMD128-NEXT: i32.shr_s $push36=, $pop35, $pop34
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop36
+; NO-SIMD128-NEXT: i32.extend8_s $push38=, $4
+; NO-SIMD128-NEXT: i32.const $push52=, 255
+; NO-SIMD128-NEXT: i32.and $push37=, $20, $pop52
+; NO-SIMD128-NEXT: i32.shr_s $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop39
+; NO-SIMD128-NEXT: i32.extend8_s $push41=, $3
+; NO-SIMD128-NEXT: i32.const $push51=, 255
+; NO-SIMD128-NEXT: i32.and $push40=, $19, $pop51
+; NO-SIMD128-NEXT: i32.shr_s $push42=, $pop41, $pop40
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop42
+; NO-SIMD128-NEXT: i32.extend8_s $push44=, $2
+; NO-SIMD128-NEXT: i32.const $push50=, 255
+; NO-SIMD128-NEXT: i32.and $push43=, $18, $pop50
+; NO-SIMD128-NEXT: i32.shr_s $push45=, $pop44, $pop43
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop45
+; NO-SIMD128-NEXT: i32.extend8_s $push47=, $1
+; NO-SIMD128-NEXT: i32.const $push49=, 255
+; NO-SIMD128-NEXT: i32.and $push46=, $17, $pop49
; NO-SIMD128-NEXT: i32.shr_s $push48=, $pop47, $pop46
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push54=, 7
-; NO-SIMD128-NEXT: i32.add $push55=, $0, $pop54
-; NO-SIMD128-NEXT: i32.extend8_s $push52=, $8
-; NO-SIMD128-NEXT: i32.const $push74=, 255
-; NO-SIMD128-NEXT: i32.and $push51=, $24, $pop74
-; NO-SIMD128-NEXT: i32.shr_s $push53=, $pop52, $pop51
-; NO-SIMD128-NEXT: i32.store8 0($pop55), $pop53
-; NO-SIMD128-NEXT: i32.const $push59=, 6
-; NO-SIMD128-NEXT: i32.add $push60=, $0, $pop59
-; NO-SIMD128-NEXT: i32.extend8_s $push57=, $7
-; NO-SIMD128-NEXT: i32.const $push73=, 255
-; NO-SIMD128-NEXT: i32.and $push56=, $23, $pop73
-; NO-SIMD128-NEXT: i32.shr_s $push58=, $pop57, $pop56
-; NO-SIMD128-NEXT: i32.store8 0($pop60), $pop58
-; NO-SIMD128-NEXT: i32.const $push64=, 5
-; NO-SIMD128-NEXT: i32.add $push65=, $0, $pop64
-; NO-SIMD128-NEXT: i32.extend8_s $push62=, $6
-; NO-SIMD128-NEXT: i32.const $push72=, 255
-; NO-SIMD128-NEXT: i32.and $push61=, $22, $pop72
-; NO-SIMD128-NEXT: i32.shr_s $push63=, $pop62, $pop61
-; NO-SIMD128-NEXT: i32.store8 0($pop65), $pop63
-; NO-SIMD128-NEXT: i32.const $push69=, 3
-; NO-SIMD128-NEXT: i32.add $push70=, $0, $pop69
-; NO-SIMD128-NEXT: i32.extend8_s $push67=, $4
-; NO-SIMD128-NEXT: i32.const $push71=, 255
-; NO-SIMD128-NEXT: i32.and $push66=, $20, $pop71
-; NO-SIMD128-NEXT: i32.shr_s $push68=, $pop67, $pop66
-; NO-SIMD128-NEXT: i32.store8 0($pop70), $pop68
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop48
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_vec_v16i8:
@@ -3924,102 +3242,80 @@ define <16 x i8> @shr_s_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: i32.shr_s $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop3
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push5=, $2
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop85
+; NO-SIMD128-FAST-NEXT: i32.const $push63=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop63
; NO-SIMD128-FAST-NEXT: i32.shr_s $push6=, $pop5, $pop4
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push8=, $3
-; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop84
+; NO-SIMD128-FAST-NEXT: i32.const $push62=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop62
; NO-SIMD128-FAST-NEXT: i32.shr_s $push9=, $pop8, $pop7
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $4
-; NO-SIMD128-FAST-NEXT: i32.const $push83=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $20, $pop83
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push14=, $pop13, $pop12
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop14
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $5
-; NO-SIMD128-FAST-NEXT: i32.const $push82=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $21, $pop82
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $6
-; NO-SIMD128-FAST-NEXT: i32.const $push81=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $22, $pop81
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push22=, $pop21, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop19), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push26=, $7
-; NO-SIMD128-FAST-NEXT: i32.const $push80=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $23, $pop80
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $4
+; NO-SIMD128-FAST-NEXT: i32.const $push61=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop61
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push14=, $5
+; NO-SIMD128-FAST-NEXT: i32.const $push60=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $21, $pop60
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push17=, $6
+; NO-SIMD128-FAST-NEXT: i32.const $push59=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $22, $pop59
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $7
+; NO-SIMD128-FAST-NEXT: i32.const $push58=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $23, $pop58
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $8
+; NO-SIMD128-FAST-NEXT: i32.const $push57=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $24, $pop57
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push26=, $9
+; NO-SIMD128-FAST-NEXT: i32.const $push56=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $25, $pop56
; NO-SIMD128-FAST-NEXT: i32.shr_s $push27=, $pop26, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push28=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push31=, $8
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $24, $pop79
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push32=, $pop31, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop29), $pop32
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push34=, $9
-; NO-SIMD128-FAST-NEXT: i32.const $push78=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push33=, $25, $pop78
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push35=, $pop34, $pop33
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push39=, $10
-; NO-SIMD128-FAST-NEXT: i32.const $push77=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push38=, $26, $pop77
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push40=, $pop39, $pop38
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop40
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $11
-; NO-SIMD128-FAST-NEXT: i32.const $push76=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push43=, $27, $pop76
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $10
+; NO-SIMD128-FAST-NEXT: i32.const $push55=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $26, $pop55
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push30=, $pop29, $pop28
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push32=, $11
+; NO-SIMD128-FAST-NEXT: i32.const $push54=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $27, $pop54
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push33=, $pop32, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop33
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push35=, $12
+; NO-SIMD128-FAST-NEXT: i32.const $push53=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $28, $pop53
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push36=, $pop35, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push38=, $13
+; NO-SIMD128-FAST-NEXT: i32.const $push52=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $29, $pop52
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop39
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $14
+; NO-SIMD128-FAST-NEXT: i32.const $push51=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push40=, $30, $pop51
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push42=, $pop41, $pop40
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop42
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $15
+; NO-SIMD128-FAST-NEXT: i32.const $push50=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push43=, $31, $pop50
; NO-SIMD128-FAST-NEXT: i32.shr_s $push45=, $pop44, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop42), $pop45
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push47=, $0, $pop46
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push49=, $12
-; NO-SIMD128-FAST-NEXT: i32.const $push75=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push48=, $28, $pop75
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push50=, $pop49, $pop48
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop47), $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push54=, $13
-; NO-SIMD128-FAST-NEXT: i32.const $push74=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push53=, $29, $pop74
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push55=, $pop54, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop52), $pop55
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push57=, $0, $pop56
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push59=, $14
-; NO-SIMD128-FAST-NEXT: i32.const $push73=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push58=, $30, $pop73
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push60=, $pop59, $pop58
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop57), $pop60
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push64=, $15
-; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push63=, $31, $pop72
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push65=, $pop64, $pop63
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop65
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push67=, $0, $pop66
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push69=, $16
-; NO-SIMD128-FAST-NEXT: i32.const $push71=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push68=, $32, $pop71
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push70=, $pop69, $pop68
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop67), $pop70
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop45
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push47=, $16
+; NO-SIMD128-FAST-NEXT: i32.const $push49=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push46=, $32, $pop49
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push48=, $pop47, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop48
; NO-SIMD128-FAST-NEXT: return
%a = ashr <16 x i8> %v, %x
ret <16 x i8> %a
@@ -4042,94 +3338,72 @@ define <16 x i8> @shr_u_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128: .functype shr_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $9, $pop0
-; NO-SIMD128-NEXT: i32.const $push72=, 255
-; NO-SIMD128-NEXT: i32.and $push71=, $17, $pop72
-; NO-SIMD128-NEXT: local.tee $push70=, $17=, $pop71
-; NO-SIMD128-NEXT: i32.shr_u $push2=, $pop1, $pop70
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push69=, 255
-; NO-SIMD128-NEXT: i32.and $push3=, $5, $pop69
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop0
+; NO-SIMD128-NEXT: i32.const $push50=, 255
+; NO-SIMD128-NEXT: i32.and $push49=, $17, $pop50
+; NO-SIMD128-NEXT: local.tee $push48=, $17=, $pop49
+; NO-SIMD128-NEXT: i32.shr_u $push2=, $pop1, $pop48
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push47=, 255
+; NO-SIMD128-NEXT: i32.and $push3=, $15, $pop47
; NO-SIMD128-NEXT: i32.shr_u $push4=, $pop3, $17
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push68=, 255
-; NO-SIMD128-NEXT: i32.and $push5=, $3, $pop68
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push46=, 255
+; NO-SIMD128-NEXT: i32.and $push5=, $14, $pop46
; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $17
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push67=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $2, $pop67
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push45=, 255
+; NO-SIMD128-NEXT: i32.and $push7=, $13, $pop45
; NO-SIMD128-NEXT: i32.shr_u $push8=, $pop7, $17
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push66=, 255
-; NO-SIMD128-NEXT: i32.and $push9=, $1, $pop66
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push44=, 255
+; NO-SIMD128-NEXT: i32.and $push9=, $12, $pop44
; NO-SIMD128-NEXT: i32.shr_u $push10=, $pop9, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop10
-; NO-SIMD128-NEXT: i32.const $push13=, 15
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push65=, 255
-; NO-SIMD128-NEXT: i32.and $push11=, $16, $pop65
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push43=, 255
+; NO-SIMD128-NEXT: i32.and $push11=, $11, $pop43
; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push17=, 14
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push64=, 255
-; NO-SIMD128-NEXT: i32.and $push15=, $15, $pop64
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push42=, 255
+; NO-SIMD128-NEXT: i32.and $push13=, $10, $pop42
+; NO-SIMD128-NEXT: i32.shr_u $push14=, $pop13, $17
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push41=, 255
+; NO-SIMD128-NEXT: i32.and $push15=, $9, $pop41
; NO-SIMD128-NEXT: i32.shr_u $push16=, $pop15, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push21=, 13
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.const $push63=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $14, $pop63
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop16
+; NO-SIMD128-NEXT: i32.const $push40=, 255
+; NO-SIMD128-NEXT: i32.and $push17=, $8, $pop40
+; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $17
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push39=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $7, $pop39
; NO-SIMD128-NEXT: i32.shr_u $push20=, $pop19, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push25=, 12
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push62=, 255
-; NO-SIMD128-NEXT: i32.and $push23=, $13, $pop62
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push38=, 255
+; NO-SIMD128-NEXT: i32.and $push21=, $6, $pop38
+; NO-SIMD128-NEXT: i32.shr_u $push22=, $pop21, $17
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop22
+; NO-SIMD128-NEXT: i32.const $push37=, 255
+; NO-SIMD128-NEXT: i32.and $push23=, $5, $pop37
; NO-SIMD128-NEXT: i32.shr_u $push24=, $pop23, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push61=, 255
-; NO-SIMD128-NEXT: i32.and $push27=, $12, $pop61
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push36=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $4, $pop36
+; NO-SIMD128-NEXT: i32.shr_u $push26=, $pop25, $17
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop26
+; NO-SIMD128-NEXT: i32.const $push35=, 255
+; NO-SIMD128-NEXT: i32.and $push27=, $3, $pop35
; NO-SIMD128-NEXT: i32.shr_u $push28=, $pop27, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push33=, 10
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.const $push60=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $11, $pop60
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push34=, 255
+; NO-SIMD128-NEXT: i32.and $push29=, $2, $pop34
+; NO-SIMD128-NEXT: i32.shr_u $push30=, $pop29, $17
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop30
+; NO-SIMD128-NEXT: i32.const $push33=, 255
+; NO-SIMD128-NEXT: i32.and $push31=, $1, $pop33
; NO-SIMD128-NEXT: i32.shr_u $push32=, $pop31, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push37=, 9
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.const $push59=, 255
-; NO-SIMD128-NEXT: i32.and $push35=, $10, $pop59
-; NO-SIMD128-NEXT: i32.shr_u $push36=, $pop35, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
-; NO-SIMD128-NEXT: i32.const $push41=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.const $push58=, 255
-; NO-SIMD128-NEXT: i32.and $push39=, $8, $pop58
-; NO-SIMD128-NEXT: i32.shr_u $push40=, $pop39, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push45=, 6
-; NO-SIMD128-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-NEXT: i32.const $push57=, 255
-; NO-SIMD128-NEXT: i32.and $push43=, $7, $pop57
-; NO-SIMD128-NEXT: i32.shr_u $push44=, $pop43, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop46), $pop44
-; NO-SIMD128-NEXT: i32.const $push49=, 5
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.const $push56=, 255
-; NO-SIMD128-NEXT: i32.and $push47=, $6, $pop56
-; NO-SIMD128-NEXT: i32.shr_u $push48=, $pop47, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push53=, 3
-; NO-SIMD128-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-NEXT: i32.const $push55=, 255
-; NO-SIMD128-NEXT: i32.and $push51=, $4, $pop55
-; NO-SIMD128-NEXT: i32.shr_u $push52=, $pop51, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop54), $pop52
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_v16i8:
@@ -4137,93 +3411,71 @@ define <16 x i8> @shr_u_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push1=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push71=, $17, $pop72
-; NO-SIMD128-FAST-NEXT: local.tee $push70=, $1=, $pop71
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $pop1, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push50=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push49=, $17, $pop50
+; NO-SIMD128-FAST-NEXT: local.tee $push48=, $1=, $pop49
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $pop1, $pop48
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push3=, $2, $pop69
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $2, $pop47
; NO-SIMD128-FAST-NEXT: i32.shr_u $push4=, $pop3, $1
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $3, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $3, $pop46
; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $4, $pop67
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $4, $pop45
; NO-SIMD128-FAST-NEXT: i32.shr_u $push8=, $pop7, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $5, $pop66
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $5, $pop44
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push10=, $pop9, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $6, $pop43
; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push15=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $6, $pop65
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $7, $pop42
; NO-SIMD128-FAST-NEXT: i32.shr_u $push14=, $pop13, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push17=, $7, $pop64
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push15=, $8, $pop41
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push16=, $pop15, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $9, $pop40
; NO-SIMD128-FAST-NEXT: i32.shr_u $push18=, $pop17, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $8, $pop63
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $10, $pop39
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push20=, $pop19, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $11, $pop38
; NO-SIMD128-FAST-NEXT: i32.shr_u $push22=, $pop21, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push62=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $9, $pop62
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop22
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $12, $pop37
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push24=, $pop23, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $13, $pop36
; NO-SIMD128-FAST-NEXT: i32.shr_u $push26=, $pop25, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push27=, $10, $pop61
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $14, $pop35
; NO-SIMD128-FAST-NEXT: i32.shr_u $push28=, $pop27, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push31=, $11, $pop60
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $15, $pop34
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push30=, $pop29, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $16, $pop33
; NO-SIMD128-FAST-NEXT: i32.shr_u $push32=, $pop31, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push37=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $12, $pop59
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push36=, $pop35, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop38), $pop36
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push39=, $13, $pop58
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push40=, $pop39, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-FAST-NEXT: i32.const $push57=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push43=, $14, $pop57
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push44=, $pop43, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop46), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push47=, $15, $pop56
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push48=, $pop47, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push51=, $16, $pop55
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push52=, $pop51, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop54), $pop52
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <16 x i8> undef, i8 %x, i32 0
%s = shufflevector <16 x i8> %t, <16 x i8> undef,
@@ -4440,123 +3692,101 @@ define <16 x i8> @shr_u_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128: .functype shr_u_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push2=, $9, $pop0
-; NO-SIMD128-NEXT: i32.const $push101=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $25, $pop101
-; NO-SIMD128-NEXT: i32.shr_u $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push100=, 255
-; NO-SIMD128-NEXT: i32.and $push5=, $5, $pop100
-; NO-SIMD128-NEXT: i32.const $push99=, 255
-; NO-SIMD128-NEXT: i32.and $push4=, $21, $pop99
-; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push98=, 255
-; NO-SIMD128-NEXT: i32.and $push8=, $3, $pop98
-; NO-SIMD128-NEXT: i32.const $push97=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $19, $pop97
-; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push96=, 255
-; NO-SIMD128-NEXT: i32.and $push11=, $2, $pop96
-; NO-SIMD128-NEXT: i32.const $push95=, 255
-; NO-SIMD128-NEXT: i32.and $push10=, $18, $pop95
-; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop12
-; NO-SIMD128-NEXT: i32.const $push94=, 255
-; NO-SIMD128-NEXT: i32.and $push14=, $1, $pop94
-; NO-SIMD128-NEXT: i32.const $push93=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $17, $pop93
-; NO-SIMD128-NEXT: i32.shr_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 15
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push92=, 255
-; NO-SIMD128-NEXT: i32.and $push17=, $16, $pop92
-; NO-SIMD128-NEXT: i32.const $push91=, 255
-; NO-SIMD128-NEXT: i32.and $push16=, $32, $pop91
-; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push24=, 14
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.const $push90=, 255
-; NO-SIMD128-NEXT: i32.and $push22=, $15, $pop90
-; NO-SIMD128-NEXT: i32.const $push89=, 255
-; NO-SIMD128-NEXT: i32.and $push21=, $31, $pop89
-; NO-SIMD128-NEXT: i32.shr_u $push23=, $pop22, $pop21
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push29=, 13
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push88=, 255
-; NO-SIMD128-NEXT: i32.and $push27=, $14, $pop88
-; NO-SIMD128-NEXT: i32.const $push87=, 255
-; NO-SIMD128-NEXT: i32.and $push26=, $30, $pop87
-; NO-SIMD128-NEXT: i32.shr_u $push28=, $pop27, $pop26
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push34=, 12
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.const $push86=, 255
-; NO-SIMD128-NEXT: i32.and $push32=, $13, $pop86
-; NO-SIMD128-NEXT: i32.const $push85=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $29, $pop85
-; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop31
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push39=, 11
-; NO-SIMD128-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-NEXT: i32.const $push84=, 255
-; NO-SIMD128-NEXT: i32.and $push37=, $12, $pop84
-; NO-SIMD128-NEXT: i32.const $push83=, 255
-; NO-SIMD128-NEXT: i32.and $push36=, $28, $pop83
-; NO-SIMD128-NEXT: i32.shr_u $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-NEXT: i32.const $push44=, 10
-; NO-SIMD128-NEXT: i32.add $push45=, $0, $pop44
-; NO-SIMD128-NEXT: i32.const $push82=, 255
-; NO-SIMD128-NEXT: i32.and $push42=, $11, $pop82
-; NO-SIMD128-NEXT: i32.const $push81=, 255
-; NO-SIMD128-NEXT: i32.and $push41=, $27, $pop81
-; NO-SIMD128-NEXT: i32.shr_u $push43=, $pop42, $pop41
-; NO-SIMD128-NEXT: i32.store8 0($pop45), $pop43
-; NO-SIMD128-NEXT: i32.const $push49=, 9
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.const $push80=, 255
-; NO-SIMD128-NEXT: i32.and $push47=, $10, $pop80
+; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0
; NO-SIMD128-NEXT: i32.const $push79=, 255
-; NO-SIMD128-NEXT: i32.and $push46=, $26, $pop79
-; NO-SIMD128-NEXT: i32.shr_u $push48=, $pop47, $pop46
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push54=, 7
-; NO-SIMD128-NEXT: i32.add $push55=, $0, $pop54
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop79
+; NO-SIMD128-NEXT: i32.shr_u $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop3
; NO-SIMD128-NEXT: i32.const $push78=, 255
-; NO-SIMD128-NEXT: i32.and $push52=, $8, $pop78
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $pop78
; NO-SIMD128-NEXT: i32.const $push77=, 255
-; NO-SIMD128-NEXT: i32.and $push51=, $24, $pop77
-; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop51
-; NO-SIMD128-NEXT: i32.store8 0($pop55), $pop53
-; NO-SIMD128-NEXT: i32.const $push59=, 6
-; NO-SIMD128-NEXT: i32.add $push60=, $0, $pop59
+; NO-SIMD128-NEXT: i32.and $push4=, $31, $pop77
+; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop6
; NO-SIMD128-NEXT: i32.const $push76=, 255
-; NO-SIMD128-NEXT: i32.and $push57=, $7, $pop76
+; NO-SIMD128-NEXT: i32.and $push8=, $14, $pop76
; NO-SIMD128-NEXT: i32.const $push75=, 255
-; NO-SIMD128-NEXT: i32.and $push56=, $23, $pop75
-; NO-SIMD128-NEXT: i32.shr_u $push58=, $pop57, $pop56
-; NO-SIMD128-NEXT: i32.store8 0($pop60), $pop58
-; NO-SIMD128-NEXT: i32.const $push64=, 5
-; NO-SIMD128-NEXT: i32.add $push65=, $0, $pop64
+; NO-SIMD128-NEXT: i32.and $push7=, $30, $pop75
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop9
; NO-SIMD128-NEXT: i32.const $push74=, 255
-; NO-SIMD128-NEXT: i32.and $push62=, $6, $pop74
+; NO-SIMD128-NEXT: i32.and $push11=, $13, $pop74
; NO-SIMD128-NEXT: i32.const $push73=, 255
-; NO-SIMD128-NEXT: i32.and $push61=, $22, $pop73
-; NO-SIMD128-NEXT: i32.shr_u $push63=, $pop62, $pop61
-; NO-SIMD128-NEXT: i32.store8 0($pop65), $pop63
-; NO-SIMD128-NEXT: i32.const $push69=, 3
-; NO-SIMD128-NEXT: i32.add $push70=, $0, $pop69
+; NO-SIMD128-NEXT: i32.and $push10=, $29, $pop73
+; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop12
; NO-SIMD128-NEXT: i32.const $push72=, 255
-; NO-SIMD128-NEXT: i32.and $push67=, $4, $pop72
+; NO-SIMD128-NEXT: i32.and $push14=, $12, $pop72
; NO-SIMD128-NEXT: i32.const $push71=, 255
-; NO-SIMD128-NEXT: i32.and $push66=, $20, $pop71
-; NO-SIMD128-NEXT: i32.shr_u $push68=, $pop67, $pop66
-; NO-SIMD128-NEXT: i32.store8 0($pop70), $pop68
+; NO-SIMD128-NEXT: i32.and $push13=, $28, $pop71
+; NO-SIMD128-NEXT: i32.shr_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push70=, 255
+; NO-SIMD128-NEXT: i32.and $push17=, $11, $pop70
+; NO-SIMD128-NEXT: i32.const $push69=, 255
+; NO-SIMD128-NEXT: i32.and $push16=, $27, $pop69
+; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push68=, 255
+; NO-SIMD128-NEXT: i32.and $push20=, $10, $pop68
+; NO-SIMD128-NEXT: i32.const $push67=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $26, $pop67
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop21
+; NO-SIMD128-NEXT: i32.const $push66=, 255
+; NO-SIMD128-NEXT: i32.and $push23=, $9, $pop66
+; NO-SIMD128-NEXT: i32.const $push65=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $25, $pop65
+; NO-SIMD128-NEXT: i32.shr_u $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push64=, 255
+; NO-SIMD128-NEXT: i32.and $push26=, $8, $pop64
+; NO-SIMD128-NEXT: i32.const $push63=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $24, $pop63
+; NO-SIMD128-NEXT: i32.shr_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop27
+; NO-SIMD128-NEXT: i32.const $push62=, 255
+; NO-SIMD128-NEXT: i32.and $push29=, $7, $pop62
+; NO-SIMD128-NEXT: i32.const $push61=, 255
+; NO-SIMD128-NEXT: i32.and $push28=, $23, $pop61
+; NO-SIMD128-NEXT: i32.shr_u $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop30
+; NO-SIMD128-NEXT: i32.const $push60=, 255
+; NO-SIMD128-NEXT: i32.and $push32=, $6, $pop60
+; NO-SIMD128-NEXT: i32.const $push59=, 255
+; NO-SIMD128-NEXT: i32.and $push31=, $22, $pop59
+; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop31
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop33
+; NO-SIMD128-NEXT: i32.const $push58=, 255
+; NO-SIMD128-NEXT: i32.and $push35=, $5, $pop58
+; NO-SIMD128-NEXT: i32.const $push57=, 255
+; NO-SIMD128-NEXT: i32.and $push34=, $21, $pop57
+; NO-SIMD128-NEXT: i32.shr_u $push36=, $pop35, $pop34
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop36
+; NO-SIMD128-NEXT: i32.const $push56=, 255
+; NO-SIMD128-NEXT: i32.and $push38=, $4, $pop56
+; NO-SIMD128-NEXT: i32.const $push55=, 255
+; NO-SIMD128-NEXT: i32.and $push37=, $20, $pop55
+; NO-SIMD128-NEXT: i32.shr_u $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop39
+; NO-SIMD128-NEXT: i32.const $push54=, 255
+; NO-SIMD128-NEXT: i32.and $push41=, $3, $pop54
+; NO-SIMD128-NEXT: i32.const $push53=, 255
+; NO-SIMD128-NEXT: i32.and $push40=, $19, $pop53
+; NO-SIMD128-NEXT: i32.shr_u $push42=, $pop41, $pop40
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop42
+; NO-SIMD128-NEXT: i32.const $push52=, 255
+; NO-SIMD128-NEXT: i32.and $push44=, $2, $pop52
+; NO-SIMD128-NEXT: i32.const $push51=, 255
+; NO-SIMD128-NEXT: i32.and $push43=, $18, $pop51
+; NO-SIMD128-NEXT: i32.shr_u $push45=, $pop44, $pop43
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop45
+; NO-SIMD128-NEXT: i32.const $push50=, 255
+; NO-SIMD128-NEXT: i32.and $push47=, $1, $pop50
+; NO-SIMD128-NEXT: i32.const $push49=, 255
+; NO-SIMD128-NEXT: i32.and $push46=, $17, $pop49
+; NO-SIMD128-NEXT: i32.shr_u $push48=, $pop47, $pop46
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop48
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_vec_v16i8:
@@ -4564,122 +3794,100 @@ define <16 x i8> @shr_u_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop101
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop79
; NO-SIMD128-FAST-NEXT: i32.shr_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop100
-; NO-SIMD128-FAST-NEXT: i32.const $push99=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop99
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop98
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop97
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop96
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop95
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $5, $pop94
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $21, $pop93
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $6, $pop92
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $22, $pop91
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $7, $pop90
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $23, $pop89
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $pop88
-; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $24, $pop87
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop32), $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push86=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $9, $pop86
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push33=, $25, $pop85
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop33
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $10, $pop84
-; NO-SIMD128-FAST-NEXT: i32.const $push83=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push36=, $26, $pop83
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push38=, $pop37, $pop36
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push45=, $0, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push82=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push42=, $11, $pop82
-; NO-SIMD128-FAST-NEXT: i32.const $push81=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push41=, $27, $pop81
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push43=, $pop42, $pop41
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop45), $pop43
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push80=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push47=, $12, $pop80
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push46=, $28, $pop79
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push48=, $pop47, $pop46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push55=, $0, $pop54
; NO-SIMD128-FAST-NEXT: i32.const $push78=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push52=, $13, $pop78
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop78
; NO-SIMD128-FAST-NEXT: i32.const $push77=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push51=, $29, $pop77
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push53=, $pop52, $pop51
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop55), $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push60=, $0, $pop59
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop77
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push76=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push57=, $14, $pop76
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop76
; NO-SIMD128-FAST-NEXT: i32.const $push75=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push56=, $30, $pop75
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push58=, $pop57, $pop56
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop60), $pop58
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push65=, $0, $pop64
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop75
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.const $push74=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push62=, $15, $pop74
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop74
; NO-SIMD128-FAST-NEXT: i32.const $push73=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push61=, $31, $pop73
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push63=, $pop62, $pop61
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop65), $pop63
-; NO-SIMD128-FAST-NEXT: i32.const $push69=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push70=, $0, $pop69
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop73
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop12
; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push67=, $16, $pop72
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $5, $pop72
; NO-SIMD128-FAST-NEXT: i32.const $push71=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push66=, $32, $pop71
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push68=, $pop67, $pop66
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop70), $pop68
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $21, $pop71
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $6, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $22, $pop69
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $23, $pop67
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $8, $pop66
+; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $24, $pop65
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push64=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $9, $pop64
+; NO-SIMD128-FAST-NEXT: i32.const $push63=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $25, $pop63
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.const $push62=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $10, $pop62
+; NO-SIMD128-FAST-NEXT: i32.const $push61=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $26, $pop61
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push30=, $pop29, $pop28
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push60=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $11, $pop60
+; NO-SIMD128-FAST-NEXT: i32.const $push59=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $27, $pop59
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop33
+; NO-SIMD128-FAST-NEXT: i32.const $push58=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push35=, $12, $pop58
+; NO-SIMD128-FAST-NEXT: i32.const $push57=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $28, $pop57
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push36=, $pop35, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push56=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push38=, $13, $pop56
+; NO-SIMD128-FAST-NEXT: i32.const $push55=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $29, $pop55
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop39
+; NO-SIMD128-FAST-NEXT: i32.const $push54=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push41=, $14, $pop54
+; NO-SIMD128-FAST-NEXT: i32.const $push53=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push40=, $30, $pop53
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push42=, $pop41, $pop40
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop42
+; NO-SIMD128-FAST-NEXT: i32.const $push52=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push44=, $15, $pop52
+; NO-SIMD128-FAST-NEXT: i32.const $push51=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push43=, $31, $pop51
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push50=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push47=, $16, $pop50
+; NO-SIMD128-FAST-NEXT: i32.const $push49=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push46=, $32, $pop49
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push48=, $pop47, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop48
; NO-SIMD128-FAST-NEXT: return
%a = lshr <16 x i8> %v, %x
ret <16 x i8> %a
@@ -4701,60 +3909,38 @@ define <16 x i8> @and_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: and_v16i8:
; NO-SIMD128: .functype and_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.and $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.and $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.and $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.and $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.and $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.and $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.and $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.and $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.and $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.and $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.and $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.and $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.and $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.and $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.and $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.and $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.and $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.and $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.and $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.and $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.and $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.and $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.and $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.and $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.and $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.and $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.and $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.and $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.and $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.and $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.and $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.and $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: and_v16i8:
@@ -4766,54 +3952,32 @@ define <16 x i8> @and_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.and $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.and $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.and $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = and <16 x i8> %x, %y
ret <16 x i8> %a
@@ -4835,60 +3999,38 @@ define <16 x i8> @or_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: or_v16i8:
; NO-SIMD128: .functype or_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.or $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.or $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.or $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.or $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.or $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.or $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.or $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.or $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.or $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.or $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.or $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.or $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.or $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.or $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.or $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.or $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.or $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.or $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.or $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.or $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.or $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.or $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.or $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.or $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.or $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.or $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.or $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.or $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.or $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.or $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.or $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.or $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: or_v16i8:
@@ -4900,54 +4042,32 @@ define <16 x i8> @or_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.or $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.or $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.or $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.or $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.or $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.or $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.or $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.or $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.or $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.or $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.or $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.or $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.or $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.or $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.or $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.or $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.or $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.or $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.or $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.or $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.or $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.or $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.or $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.or $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.or $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.or $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.or $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = or <16 x i8> %x, %y
ret <16 x i8> %a
@@ -4969,60 +4089,38 @@ define <16 x i8> @xor_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: xor_v16i8:
; NO-SIMD128: .functype xor_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.xor $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.xor $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.xor $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.xor $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.xor $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.xor $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.xor $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.xor $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.xor $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.xor $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.xor $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.xor $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.xor $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.xor $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.xor $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.xor $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.xor $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.xor $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.xor $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.xor $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.xor $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.xor $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.xor $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.xor $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.xor $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.xor $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.xor $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.xor $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.xor $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: xor_v16i8:
@@ -5034,54 +4132,32 @@ define <16 x i8> @xor_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.xor $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.xor $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.xor $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = xor <16 x i8> %x, %y
ret <16 x i8> %a
@@ -5104,75 +4180,53 @@ define <16 x i8> @not_v16i8(<16 x i8> %x) {
; NO-SIMD128: .functype not_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $9, $pop0
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push53=, -1
-; NO-SIMD128-NEXT: i32.xor $push2=, $5, $pop53
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push52=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $3, $pop52
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push51=, -1
-; NO-SIMD128-NEXT: i32.xor $push4=, $2, $pop51
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push50=, -1
-; NO-SIMD128-NEXT: i32.xor $push5=, $1, $pop50
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push7=, 15
-; NO-SIMD128-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-NEXT: i32.const $push49=, -1
-; NO-SIMD128-NEXT: i32.xor $push6=, $16, $pop49
-; NO-SIMD128-NEXT: i32.store8 0($pop8), $pop6
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.const $push48=, -1
-; NO-SIMD128-NEXT: i32.xor $push9=, $15, $pop48
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 13
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push47=, -1
-; NO-SIMD128-NEXT: i32.xor $push12=, $14, $pop47
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 12
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push46=, -1
-; NO-SIMD128-NEXT: i32.xor $push15=, $13, $pop46
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 11
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push45=, -1
-; NO-SIMD128-NEXT: i32.xor $push18=, $12, $pop45
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push22=, 10
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.const $push44=, -1
-; NO-SIMD128-NEXT: i32.xor $push21=, $11, $pop44
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push25=, 9
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push43=, -1
-; NO-SIMD128-NEXT: i32.xor $push24=, $10, $pop43
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push28=, 7
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.const $push42=, -1
-; NO-SIMD128-NEXT: i32.xor $push27=, $8, $pop42
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-NEXT: i32.const $push41=, -1
-; NO-SIMD128-NEXT: i32.xor $push30=, $7, $pop41
-; NO-SIMD128-NEXT: i32.store8 0($pop32), $pop30
-; NO-SIMD128-NEXT: i32.const $push34=, 5
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.const $push40=, -1
-; NO-SIMD128-NEXT: i32.xor $push33=, $6, $pop40
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push37=, 3
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.const $push39=, -1
-; NO-SIMD128-NEXT: i32.xor $push36=, $4, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
+; NO-SIMD128-NEXT: i32.xor $push1=, $16, $pop0
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push31=, -1
+; NO-SIMD128-NEXT: i32.xor $push2=, $15, $pop31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push30=, -1
+; NO-SIMD128-NEXT: i32.xor $push3=, $14, $pop30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push29=, -1
+; NO-SIMD128-NEXT: i32.xor $push4=, $13, $pop29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push28=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $12, $pop28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push27=, -1
+; NO-SIMD128-NEXT: i32.xor $push6=, $11, $pop27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push26=, -1
+; NO-SIMD128-NEXT: i32.xor $push7=, $10, $pop26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push25=, -1
+; NO-SIMD128-NEXT: i32.xor $push8=, $9, $pop25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push24=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $8, $pop24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop9
+; NO-SIMD128-NEXT: i32.const $push23=, -1
+; NO-SIMD128-NEXT: i32.xor $push10=, $7, $pop23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push22=, -1
+; NO-SIMD128-NEXT: i32.xor $push11=, $6, $pop22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop11
+; NO-SIMD128-NEXT: i32.const $push21=, -1
+; NO-SIMD128-NEXT: i32.xor $push12=, $5, $pop21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push20=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $4, $pop20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop13
+; NO-SIMD128-NEXT: i32.const $push19=, -1
+; NO-SIMD128-NEXT: i32.xor $push14=, $3, $pop19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push18=, -1
+; NO-SIMD128-NEXT: i32.xor $push15=, $2, $pop18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push17=, -1
+; NO-SIMD128-NEXT: i32.xor $push16=, $1, $pop17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: not_v16i8:
@@ -5181,73 +4235,51 @@ define <16 x i8> @not_v16i8(<16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, -1
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop31
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop52
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop30
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $4, $pop51
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $5, $pop50
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $6, $pop49
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $7, $pop48
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $8, $pop47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $9, $pop46
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $10, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop19), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $11, $pop44
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $12, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $13, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $14, $pop41
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $15, $pop40
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $16, $pop39
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $4, $pop29
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $5, $pop28
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $6, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $7, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $8, $pop25
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push24=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $9, $pop24
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $10, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $11, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $12, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $13, $pop20
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $14, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $15, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $16, $pop17
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%a = xor <16 x i8> %x, <i8 -1, i8 -1, i8 -1, i8 -1,
i8 -1, i8 -1, i8 -1, i8 -1,
@@ -5274,91 +4306,69 @@ define <16 x i8> @andnot_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128: .functype andnot_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $25, $pop0
-; NO-SIMD128-NEXT: i32.and $push2=, $9, $pop1
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push69=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $21, $pop69
-; NO-SIMD128-NEXT: i32.and $push4=, $5, $pop3
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push68=, -1
-; NO-SIMD128-NEXT: i32.xor $push5=, $19, $pop68
-; NO-SIMD128-NEXT: i32.and $push6=, $3, $pop5
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push67=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $18, $pop67
-; NO-SIMD128-NEXT: i32.and $push8=, $2, $pop7
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push66=, -1
-; NO-SIMD128-NEXT: i32.xor $push9=, $17, $pop66
-; NO-SIMD128-NEXT: i32.and $push10=, $1, $pop9
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop10
-; NO-SIMD128-NEXT: i32.const $push13=, 15
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push65=, -1
-; NO-SIMD128-NEXT: i32.xor $push11=, $32, $pop65
-; NO-SIMD128-NEXT: i32.and $push12=, $16, $pop11
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push17=, 14
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push64=, -1
-; NO-SIMD128-NEXT: i32.xor $push15=, $31, $pop64
-; NO-SIMD128-NEXT: i32.and $push16=, $15, $pop15
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push21=, 13
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.const $push63=, -1
-; NO-SIMD128-NEXT: i32.xor $push19=, $30, $pop63
-; NO-SIMD128-NEXT: i32.and $push20=, $14, $pop19
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push25=, 12
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push62=, -1
-; NO-SIMD128-NEXT: i32.xor $push23=, $29, $pop62
-; NO-SIMD128-NEXT: i32.and $push24=, $13, $pop23
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push61=, -1
-; NO-SIMD128-NEXT: i32.xor $push27=, $28, $pop61
-; NO-SIMD128-NEXT: i32.and $push28=, $12, $pop27
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push33=, 10
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.const $push60=, -1
-; NO-SIMD128-NEXT: i32.xor $push31=, $27, $pop60
-; NO-SIMD128-NEXT: i32.and $push32=, $11, $pop31
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push37=, 9
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.const $push59=, -1
-; NO-SIMD128-NEXT: i32.xor $push35=, $26, $pop59
-; NO-SIMD128-NEXT: i32.and $push36=, $10, $pop35
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
-; NO-SIMD128-NEXT: i32.const $push41=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.const $push58=, -1
-; NO-SIMD128-NEXT: i32.xor $push39=, $24, $pop58
-; NO-SIMD128-NEXT: i32.and $push40=, $8, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push45=, 6
-; NO-SIMD128-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-NEXT: i32.const $push57=, -1
-; NO-SIMD128-NEXT: i32.xor $push43=, $23, $pop57
-; NO-SIMD128-NEXT: i32.and $push44=, $7, $pop43
-; NO-SIMD128-NEXT: i32.store8 0($pop46), $pop44
-; NO-SIMD128-NEXT: i32.const $push49=, 5
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.const $push56=, -1
-; NO-SIMD128-NEXT: i32.xor $push47=, $22, $pop56
-; NO-SIMD128-NEXT: i32.and $push48=, $6, $pop47
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push53=, 3
-; NO-SIMD128-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-NEXT: i32.const $push55=, -1
-; NO-SIMD128-NEXT: i32.xor $push51=, $20, $pop55
-; NO-SIMD128-NEXT: i32.and $push52=, $4, $pop51
-; NO-SIMD128-NEXT: i32.store8 0($pop54), $pop52
+; NO-SIMD128-NEXT: i32.xor $push1=, $32, $pop0
+; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop1
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push47=, -1
+; NO-SIMD128-NEXT: i32.xor $push3=, $31, $pop47
+; NO-SIMD128-NEXT: i32.and $push4=, $15, $pop3
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push46=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $30, $pop46
+; NO-SIMD128-NEXT: i32.and $push6=, $14, $pop5
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push45=, -1
+; NO-SIMD128-NEXT: i32.xor $push7=, $29, $pop45
+; NO-SIMD128-NEXT: i32.and $push8=, $13, $pop7
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push44=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $28, $pop44
+; NO-SIMD128-NEXT: i32.and $push10=, $12, $pop9
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push43=, -1
+; NO-SIMD128-NEXT: i32.xor $push11=, $27, $pop43
+; NO-SIMD128-NEXT: i32.and $push12=, $11, $pop11
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push42=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $26, $pop42
+; NO-SIMD128-NEXT: i32.and $push14=, $10, $pop13
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push41=, -1
+; NO-SIMD128-NEXT: i32.xor $push15=, $25, $pop41
+; NO-SIMD128-NEXT: i32.and $push16=, $9, $pop15
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop16
+; NO-SIMD128-NEXT: i32.const $push40=, -1
+; NO-SIMD128-NEXT: i32.xor $push17=, $24, $pop40
+; NO-SIMD128-NEXT: i32.and $push18=, $8, $pop17
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push39=, -1
+; NO-SIMD128-NEXT: i32.xor $push19=, $23, $pop39
+; NO-SIMD128-NEXT: i32.and $push20=, $7, $pop19
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push38=, -1
+; NO-SIMD128-NEXT: i32.xor $push21=, $22, $pop38
+; NO-SIMD128-NEXT: i32.and $push22=, $6, $pop21
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop22
+; NO-SIMD128-NEXT: i32.const $push37=, -1
+; NO-SIMD128-NEXT: i32.xor $push23=, $21, $pop37
+; NO-SIMD128-NEXT: i32.and $push24=, $5, $pop23
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push36=, -1
+; NO-SIMD128-NEXT: i32.xor $push25=, $20, $pop36
+; NO-SIMD128-NEXT: i32.and $push26=, $4, $pop25
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop26
+; NO-SIMD128-NEXT: i32.const $push35=, -1
+; NO-SIMD128-NEXT: i32.xor $push27=, $19, $pop35
+; NO-SIMD128-NEXT: i32.and $push28=, $3, $pop27
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push34=, -1
+; NO-SIMD128-NEXT: i32.xor $push29=, $18, $pop34
+; NO-SIMD128-NEXT: i32.and $push30=, $2, $pop29
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop30
+; NO-SIMD128-NEXT: i32.const $push33=, -1
+; NO-SIMD128-NEXT: i32.xor $push31=, $17, $pop33
+; NO-SIMD128-NEXT: i32.and $push32=, $1, $pop31
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: andnot_v16i8:
@@ -5368,88 +4378,66 @@ define <16 x i8> @andnot_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $17, $pop0
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop1
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push69=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $18, $pop69
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $18, $pop47
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $2, $pop3
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push68=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $19, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $19, $pop46
; NO-SIMD128-FAST-NEXT: i32.and $push6=, $3, $pop5
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $20, $pop67
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $4, $pop9
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $21, $pop66
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $5, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push65=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $22, $pop65
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $6, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $23, $pop64
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $24, $pop63
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $8, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop24
-; NO-SIMD128-FAST-NEXT: i32.const $push62=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $25, $pop62
-; NO-SIMD128-FAST-NEXT: i32.and $push26=, $9, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $26, $pop61
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $10, $pop29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $27, $pop60
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $11, $pop33
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop32), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push37=, $28, $pop59
-; NO-SIMD128-FAST-NEXT: i32.and $push38=, $12, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $29, $pop58
-; NO-SIMD128-FAST-NEXT: i32.and $push42=, $13, $pop41
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push44=, $0, $pop43
-; NO-SIMD128-FAST-NEXT: i32.const $push57=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $30, $pop57
-; NO-SIMD128-FAST-NEXT: i32.and $push46=, $14, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop44), $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push48=, $0, $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push49=, $31, $pop56
-; NO-SIMD128-FAST-NEXT: i32.and $push50=, $15, $pop49
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop48), $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push53=, $32, $pop55
-; NO-SIMD128-FAST-NEXT: i32.and $push54=, $16, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop52), $pop54
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $20, $pop45
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $4, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $21, $pop44
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $5, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $22, $pop43
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $23, $pop42
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $7, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $24, $pop41
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $8, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $25, $pop40
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $9, $pop17
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $10, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $27, $pop38
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $11, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop22
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $28, $pop37
+; NO-SIMD128-FAST-NEXT: i32.and $push24=, $12, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $29, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $13, $pop25
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push27=, $30, $pop35
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $14, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $31, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push30=, $15, $pop29
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push31=, $32, $pop33
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $16, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%inv_y = xor <16 x i8> %y,
<i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
@@ -5477,124 +4465,102 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-LABEL: bitselect_v16i8:
; NO-SIMD128: .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 15
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.and $push0=, $16, $32
; NO-SIMD128-NEXT: i32.const $push1=, -1
; NO-SIMD128-NEXT: i32.xor $push2=, $16, $pop1
; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $48
; NO-SIMD128-NEXT: i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-NEXT: i32.store8 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.and $push7=, $15, $31
-; NO-SIMD128-NEXT: i32.const $push101=, -1
-; NO-SIMD128-NEXT: i32.xor $push8=, $15, $pop101
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $47
-; NO-SIMD128-NEXT: i32.or $push10=, $pop7, $pop9
-; NO-SIMD128-NEXT: i32.store8 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 13
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.and $push13=, $14, $30
-; NO-SIMD128-NEXT: i32.const $push100=, -1
-; NO-SIMD128-NEXT: i32.xor $push14=, $14, $pop100
-; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $46
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop4
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $31
+; NO-SIMD128-NEXT: i32.const $push79=, -1
+; NO-SIMD128-NEXT: i32.xor $push6=, $15, $pop79
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $47
+; NO-SIMD128-NEXT: i32.or $push8=, $pop5, $pop7
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop8
+; NO-SIMD128-NEXT: i32.and $push9=, $14, $30
+; NO-SIMD128-NEXT: i32.const $push78=, -1
+; NO-SIMD128-NEXT: i32.xor $push10=, $14, $pop78
+; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $46
+; NO-SIMD128-NEXT: i32.or $push12=, $pop9, $pop11
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop12
+; NO-SIMD128-NEXT: i32.and $push13=, $13, $29
+; NO-SIMD128-NEXT: i32.const $push77=, -1
+; NO-SIMD128-NEXT: i32.xor $push14=, $13, $pop77
+; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $45
; NO-SIMD128-NEXT: i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push23=, 12
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.and $push19=, $13, $29
-; NO-SIMD128-NEXT: i32.const $push99=, -1
-; NO-SIMD128-NEXT: i32.xor $push20=, $13, $pop99
-; NO-SIMD128-NEXT: i32.and $push21=, $pop20, $45
-; NO-SIMD128-NEXT: i32.or $push22=, $pop19, $pop21
-; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.and $push25=, $12, $28
-; NO-SIMD128-NEXT: i32.const $push98=, -1
-; NO-SIMD128-NEXT: i32.xor $push26=, $12, $pop98
-; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $44
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop16
+; NO-SIMD128-NEXT: i32.and $push17=, $12, $28
+; NO-SIMD128-NEXT: i32.const $push76=, -1
+; NO-SIMD128-NEXT: i32.xor $push18=, $12, $pop76
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $44
+; NO-SIMD128-NEXT: i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop20
+; NO-SIMD128-NEXT: i32.and $push21=, $11, $27
+; NO-SIMD128-NEXT: i32.const $push75=, -1
+; NO-SIMD128-NEXT: i32.xor $push22=, $11, $pop75
+; NO-SIMD128-NEXT: i32.and $push23=, $pop22, $43
+; NO-SIMD128-NEXT: i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop24
+; NO-SIMD128-NEXT: i32.and $push25=, $10, $26
+; NO-SIMD128-NEXT: i32.const $push74=, -1
+; NO-SIMD128-NEXT: i32.xor $push26=, $10, $pop74
+; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $42
; NO-SIMD128-NEXT: i32.or $push28=, $pop25, $pop27
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push35=, 10
-; NO-SIMD128-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-NEXT: i32.and $push31=, $11, $27
-; NO-SIMD128-NEXT: i32.const $push97=, -1
-; NO-SIMD128-NEXT: i32.xor $push32=, $11, $pop97
-; NO-SIMD128-NEXT: i32.and $push33=, $pop32, $43
-; NO-SIMD128-NEXT: i32.or $push34=, $pop31, $pop33
-; NO-SIMD128-NEXT: i32.store8 0($pop36), $pop34
-; NO-SIMD128-NEXT: i32.const $push41=, 9
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.and $push37=, $10, $26
-; NO-SIMD128-NEXT: i32.const $push96=, -1
-; NO-SIMD128-NEXT: i32.xor $push38=, $10, $pop96
-; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $42
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop28
+; NO-SIMD128-NEXT: i32.and $push29=, $9, $25
+; NO-SIMD128-NEXT: i32.const $push73=, -1
+; NO-SIMD128-NEXT: i32.xor $push30=, $9, $pop73
+; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $41
+; NO-SIMD128-NEXT: i32.or $push32=, $pop29, $pop31
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop32
+; NO-SIMD128-NEXT: i32.and $push33=, $8, $24
+; NO-SIMD128-NEXT: i32.const $push72=, -1
+; NO-SIMD128-NEXT: i32.xor $push34=, $8, $pop72
+; NO-SIMD128-NEXT: i32.and $push35=, $pop34, $40
+; NO-SIMD128-NEXT: i32.or $push36=, $pop33, $pop35
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop36
+; NO-SIMD128-NEXT: i32.and $push37=, $7, $23
+; NO-SIMD128-NEXT: i32.const $push71=, -1
+; NO-SIMD128-NEXT: i32.xor $push38=, $7, $pop71
+; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $39
; NO-SIMD128-NEXT: i32.or $push40=, $pop37, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.and $push43=, $9, $25
-; NO-SIMD128-NEXT: i32.const $push95=, -1
-; NO-SIMD128-NEXT: i32.xor $push44=, $9, $pop95
-; NO-SIMD128-NEXT: i32.and $push45=, $pop44, $41
-; NO-SIMD128-NEXT: i32.or $push46=, $pop43, $pop45
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop46
-; NO-SIMD128-NEXT: i32.const $push51=, 7
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.and $push47=, $8, $24
-; NO-SIMD128-NEXT: i32.const $push94=, -1
-; NO-SIMD128-NEXT: i32.xor $push48=, $8, $pop94
-; NO-SIMD128-NEXT: i32.and $push49=, $pop48, $40
-; NO-SIMD128-NEXT: i32.or $push50=, $pop47, $pop49
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.const $push57=, 6
-; NO-SIMD128-NEXT: i32.add $push58=, $0, $pop57
-; NO-SIMD128-NEXT: i32.and $push53=, $7, $23
-; NO-SIMD128-NEXT: i32.const $push93=, -1
-; NO-SIMD128-NEXT: i32.xor $push54=, $7, $pop93
-; NO-SIMD128-NEXT: i32.and $push55=, $pop54, $39
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop40
+; NO-SIMD128-NEXT: i32.and $push41=, $6, $22
+; NO-SIMD128-NEXT: i32.const $push70=, -1
+; NO-SIMD128-NEXT: i32.xor $push42=, $6, $pop70
+; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $38
+; NO-SIMD128-NEXT: i32.or $push44=, $pop41, $pop43
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop44
+; NO-SIMD128-NEXT: i32.and $push45=, $5, $21
+; NO-SIMD128-NEXT: i32.const $push69=, -1
+; NO-SIMD128-NEXT: i32.xor $push46=, $5, $pop69
+; NO-SIMD128-NEXT: i32.and $push47=, $pop46, $37
+; NO-SIMD128-NEXT: i32.or $push48=, $pop45, $pop47
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop48
+; NO-SIMD128-NEXT: i32.and $push49=, $4, $20
+; NO-SIMD128-NEXT: i32.const $push68=, -1
+; NO-SIMD128-NEXT: i32.xor $push50=, $4, $pop68
+; NO-SIMD128-NEXT: i32.and $push51=, $pop50, $36
+; NO-SIMD128-NEXT: i32.or $push52=, $pop49, $pop51
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop52
+; NO-SIMD128-NEXT: i32.and $push53=, $3, $19
+; NO-SIMD128-NEXT: i32.const $push67=, -1
+; NO-SIMD128-NEXT: i32.xor $push54=, $3, $pop67
+; NO-SIMD128-NEXT: i32.and $push55=, $pop54, $35
; NO-SIMD128-NEXT: i32.or $push56=, $pop53, $pop55
-; NO-SIMD128-NEXT: i32.store8 0($pop58), $pop56
-; NO-SIMD128-NEXT: i32.const $push63=, 5
-; NO-SIMD128-NEXT: i32.add $push64=, $0, $pop63
-; NO-SIMD128-NEXT: i32.and $push59=, $6, $22
-; NO-SIMD128-NEXT: i32.const $push92=, -1
-; NO-SIMD128-NEXT: i32.xor $push60=, $6, $pop92
-; NO-SIMD128-NEXT: i32.and $push61=, $pop60, $38
-; NO-SIMD128-NEXT: i32.or $push62=, $pop59, $pop61
-; NO-SIMD128-NEXT: i32.store8 0($pop64), $pop62
-; NO-SIMD128-NEXT: i32.and $push65=, $5, $21
-; NO-SIMD128-NEXT: i32.const $push91=, -1
-; NO-SIMD128-NEXT: i32.xor $push66=, $5, $pop91
-; NO-SIMD128-NEXT: i32.and $push67=, $pop66, $37
-; NO-SIMD128-NEXT: i32.or $push68=, $pop65, $pop67
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop68
-; NO-SIMD128-NEXT: i32.const $push73=, 3
-; NO-SIMD128-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-NEXT: i32.and $push69=, $4, $20
-; NO-SIMD128-NEXT: i32.const $push90=, -1
-; NO-SIMD128-NEXT: i32.xor $push70=, $4, $pop90
-; NO-SIMD128-NEXT: i32.and $push71=, $pop70, $36
-; NO-SIMD128-NEXT: i32.or $push72=, $pop69, $pop71
-; NO-SIMD128-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-NEXT: i32.and $push75=, $3, $19
-; NO-SIMD128-NEXT: i32.const $push89=, -1
-; NO-SIMD128-NEXT: i32.xor $push76=, $3, $pop89
-; NO-SIMD128-NEXT: i32.and $push77=, $pop76, $35
-; NO-SIMD128-NEXT: i32.or $push78=, $pop75, $pop77
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop78
-; NO-SIMD128-NEXT: i32.and $push79=, $2, $18
-; NO-SIMD128-NEXT: i32.const $push88=, -1
-; NO-SIMD128-NEXT: i32.xor $push80=, $2, $pop88
-; NO-SIMD128-NEXT: i32.and $push81=, $pop80, $34
-; NO-SIMD128-NEXT: i32.or $push82=, $pop79, $pop81
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop82
-; NO-SIMD128-NEXT: i32.and $push83=, $1, $17
-; NO-SIMD128-NEXT: i32.const $push87=, -1
-; NO-SIMD128-NEXT: i32.xor $push84=, $1, $pop87
-; NO-SIMD128-NEXT: i32.and $push85=, $pop84, $33
-; NO-SIMD128-NEXT: i32.or $push86=, $pop83, $pop85
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop86
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop56
+; NO-SIMD128-NEXT: i32.and $push57=, $2, $18
+; NO-SIMD128-NEXT: i32.const $push66=, -1
+; NO-SIMD128-NEXT: i32.xor $push58=, $2, $pop66
+; NO-SIMD128-NEXT: i32.and $push59=, $pop58, $34
+; NO-SIMD128-NEXT: i32.or $push60=, $pop57, $pop59
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop60
+; NO-SIMD128-NEXT: i32.and $push61=, $1, $17
+; NO-SIMD128-NEXT: i32.const $push65=, -1
+; NO-SIMD128-NEXT: i32.xor $push62=, $1, $pop65
+; NO-SIMD128-NEXT: i32.and $push63=, $pop62, $33
+; NO-SIMD128-NEXT: i32.or $push64=, $pop61, $pop63
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop64
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_v16i8:
@@ -5607,117 +4573,95 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop0, $pop3
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4
; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $18
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop101
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop79
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $34
; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop5, $pop7
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8
; NO-SIMD128-FAST-NEXT: i32.and $push9=, $3, $19
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop100
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop78
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $35
; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop9, $pop11
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
; NO-SIMD128-FAST-NEXT: i32.and $push13=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.const $push99=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop99
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop77
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $36
; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $5, $pop98
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $pop20, $37
-; NO-SIMD128-FAST-NEXT: i32.or $push22=, $pop19, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $6, $pop97
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $38
-; NO-SIMD128-FAST-NEXT: i32.or $push26=, $pop23, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $7, $pop96
-; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $39
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $5, $pop76
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $37
+; NO-SIMD128-FAST-NEXT: i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $6, $pop75
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $pop22, $38
+; NO-SIMD128-FAST-NEXT: i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $7, $pop74
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $pop26, $39
+; NO-SIMD128-FAST-NEXT: i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $8, $pop73
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $40
; NO-SIMD128-FAST-NEXT: i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $8, $pop95
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $40
-; NO-SIMD128-FAST-NEXT: i32.or $push38=, $pop35, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-FAST-NEXT: i32.and $push41=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $9, $pop94
-; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $41
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.and $push33=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $9, $pop72
+; NO-SIMD128-FAST-NEXT: i32.and $push35=, $pop34, $41
+; NO-SIMD128-FAST-NEXT: i32.or $push36=, $pop33, $pop35
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $10, $pop71
+; NO-SIMD128-FAST-NEXT: i32.and $push39=, $pop38, $42
+; NO-SIMD128-FAST-NEXT: i32.or $push40=, $pop37, $pop39
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40
+; NO-SIMD128-FAST-NEXT: i32.and $push41=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $11, $pop70
+; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $43
; NO-SIMD128-FAST-NEXT: i32.or $push44=, $pop41, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.and $push45=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $10, $pop93
-; NO-SIMD128-FAST-NEXT: i32.and $push47=, $pop46, $42
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44
+; NO-SIMD128-FAST-NEXT: i32.and $push45=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $12, $pop69
+; NO-SIMD128-FAST-NEXT: i32.and $push47=, $pop46, $44
; NO-SIMD128-FAST-NEXT: i32.or $push48=, $pop45, $pop47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55
-; NO-SIMD128-FAST-NEXT: i32.and $push51=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push52=, $11, $pop92
-; NO-SIMD128-FAST-NEXT: i32.and $push53=, $pop52, $43
-; NO-SIMD128-FAST-NEXT: i32.or $push54=, $pop51, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61
-; NO-SIMD128-FAST-NEXT: i32.and $push57=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $12, $pop91
-; NO-SIMD128-FAST-NEXT: i32.and $push59=, $pop58, $44
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48
+; NO-SIMD128-FAST-NEXT: i32.and $push49=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push50=, $13, $pop68
+; NO-SIMD128-FAST-NEXT: i32.and $push51=, $pop50, $45
+; NO-SIMD128-FAST-NEXT: i32.or $push52=, $pop49, $pop51
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52
+; NO-SIMD128-FAST-NEXT: i32.and $push53=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $14, $pop67
+; NO-SIMD128-FAST-NEXT: i32.and $push55=, $pop54, $46
+; NO-SIMD128-FAST-NEXT: i32.or $push56=, $pop53, $pop55
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56
+; NO-SIMD128-FAST-NEXT: i32.and $push57=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $15, $pop66
+; NO-SIMD128-FAST-NEXT: i32.and $push59=, $pop58, $47
; NO-SIMD128-FAST-NEXT: i32.or $push60=, $pop57, $pop59
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop60
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push68=, $0, $pop67
-; NO-SIMD128-FAST-NEXT: i32.and $push63=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push64=, $13, $pop90
-; NO-SIMD128-FAST-NEXT: i32.and $push65=, $pop64, $45
-; NO-SIMD128-FAST-NEXT: i32.or $push66=, $pop63, $pop65
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop68), $pop66
-; NO-SIMD128-FAST-NEXT: i32.const $push73=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-FAST-NEXT: i32.and $push69=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push70=, $14, $pop89
-; NO-SIMD128-FAST-NEXT: i32.and $push71=, $pop70, $46
-; NO-SIMD128-FAST-NEXT: i32.or $push72=, $pop69, $pop71
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push80=, $0, $pop79
-; NO-SIMD128-FAST-NEXT: i32.and $push75=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push76=, $15, $pop88
-; NO-SIMD128-FAST-NEXT: i32.and $push77=, $pop76, $47
-; NO-SIMD128-FAST-NEXT: i32.or $push78=, $pop75, $pop77
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop80), $pop78
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push86=, $0, $pop85
-; NO-SIMD128-FAST-NEXT: i32.and $push81=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.const $push87=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push82=, $16, $pop87
-; NO-SIMD128-FAST-NEXT: i32.and $push83=, $pop82, $48
-; NO-SIMD128-FAST-NEXT: i32.or $push84=, $pop81, $pop83
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop86), $pop84
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60
+; NO-SIMD128-FAST-NEXT: i32.and $push61=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.const $push65=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push62=, $16, $pop65
+; NO-SIMD128-FAST-NEXT: i32.and $push63=, $pop62, $48
+; NO-SIMD128-FAST-NEXT: i32.or $push64=, $pop61, $pop63
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64
; NO-SIMD128-FAST-NEXT: return
%masked_v1 = and <16 x i8> %c, %v1
%inv_mask = xor <16 x i8> %c,
@@ -5746,92 +4690,70 @@ define <16 x i8> @bitselect_xor_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2
; NO-SIMD128-LABEL: bitselect_xor_v16i8:
; NO-SIMD128: .functype bitselect_xor_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push3=, 15
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
; NO-SIMD128-NEXT: i32.xor $push0=, $32, $48
; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $16
; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $48
-; NO-SIMD128-NEXT: i32.store8 0($pop4), $pop2
-; NO-SIMD128-NEXT: i32.const $push8=, 14
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.xor $push5=, $31, $47
-; NO-SIMD128-NEXT: i32.and $push6=, $pop5, $15
-; NO-SIMD128-NEXT: i32.xor $push7=, $pop6, $47
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push13=, 13
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.xor $push10=, $30, $46
-; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $14
-; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $46
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push18=, 12
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.xor $push15=, $29, $45
-; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $13
-; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $45
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push23=, 11
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.xor $push20=, $28, $44
-; NO-SIMD128-NEXT: i32.and $push21=, $pop20, $12
-; NO-SIMD128-NEXT: i32.xor $push22=, $pop21, $44
-; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-NEXT: i32.const $push28=, 10
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.xor $push25=, $27, $43
-; NO-SIMD128-NEXT: i32.and $push26=, $pop25, $11
-; NO-SIMD128-NEXT: i32.xor $push27=, $pop26, $43
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push33=, 9
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.xor $push30=, $26, $42
-; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $10
-; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $42
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.xor $push35=, $25, $41
-; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $9
-; NO-SIMD128-NEXT: i32.xor $push37=, $pop36, $41
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop37
-; NO-SIMD128-NEXT: i32.const $push41=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.xor $push38=, $24, $40
-; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $8
-; NO-SIMD128-NEXT: i32.xor $push40=, $pop39, $40
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push46=, 6
-; NO-SIMD128-NEXT: i32.add $push47=, $0, $pop46
-; NO-SIMD128-NEXT: i32.xor $push43=, $23, $39
-; NO-SIMD128-NEXT: i32.and $push44=, $pop43, $7
-; NO-SIMD128-NEXT: i32.xor $push45=, $pop44, $39
-; NO-SIMD128-NEXT: i32.store8 0($pop47), $pop45
-; NO-SIMD128-NEXT: i32.const $push51=, 5
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.xor $push48=, $22, $38
-; NO-SIMD128-NEXT: i32.and $push49=, $pop48, $6
-; NO-SIMD128-NEXT: i32.xor $push50=, $pop49, $38
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.xor $push53=, $21, $37
-; NO-SIMD128-NEXT: i32.and $push54=, $pop53, $5
-; NO-SIMD128-NEXT: i32.xor $push55=, $pop54, $37
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop55
-; NO-SIMD128-NEXT: i32.const $push59=, 3
-; NO-SIMD128-NEXT: i32.add $push60=, $0, $pop59
-; NO-SIMD128-NEXT: i32.xor $push56=, $20, $36
-; NO-SIMD128-NEXT: i32.and $push57=, $pop56, $4
-; NO-SIMD128-NEXT: i32.xor $push58=, $pop57, $36
-; NO-SIMD128-NEXT: i32.store8 0($pop60), $pop58
-; NO-SIMD128-NEXT: i32.xor $push61=, $19, $35
-; NO-SIMD128-NEXT: i32.and $push62=, $pop61, $3
-; NO-SIMD128-NEXT: i32.xor $push63=, $pop62, $35
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop63
-; NO-SIMD128-NEXT: i32.xor $push64=, $18, $34
-; NO-SIMD128-NEXT: i32.and $push65=, $pop64, $2
-; NO-SIMD128-NEXT: i32.xor $push66=, $pop65, $34
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop66
-; NO-SIMD128-NEXT: i32.xor $push67=, $17, $33
-; NO-SIMD128-NEXT: i32.and $push68=, $pop67, $1
-; NO-SIMD128-NEXT: i32.xor $push69=, $pop68, $33
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop69
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $31, $47
+; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $15
+; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $47
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $30, $46
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $14
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $46
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push9=, $29, $45
+; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $13
+; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $45
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop11
+; NO-SIMD128-NEXT: i32.xor $push12=, $28, $44
+; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $12
+; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $44
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop14
+; NO-SIMD128-NEXT: i32.xor $push15=, $27, $43
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $11
+; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $43
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop17
+; NO-SIMD128-NEXT: i32.xor $push18=, $26, $42
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $10
+; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $42
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop20
+; NO-SIMD128-NEXT: i32.xor $push21=, $25, $41
+; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $9
+; NO-SIMD128-NEXT: i32.xor $push23=, $pop22, $41
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop23
+; NO-SIMD128-NEXT: i32.xor $push24=, $24, $40
+; NO-SIMD128-NEXT: i32.and $push25=, $pop24, $8
+; NO-SIMD128-NEXT: i32.xor $push26=, $pop25, $40
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop26
+; NO-SIMD128-NEXT: i32.xor $push27=, $23, $39
+; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $7
+; NO-SIMD128-NEXT: i32.xor $push29=, $pop28, $39
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop29
+; NO-SIMD128-NEXT: i32.xor $push30=, $22, $38
+; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $6
+; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $38
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop32
+; NO-SIMD128-NEXT: i32.xor $push33=, $21, $37
+; NO-SIMD128-NEXT: i32.and $push34=, $pop33, $5
+; NO-SIMD128-NEXT: i32.xor $push35=, $pop34, $37
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop35
+; NO-SIMD128-NEXT: i32.xor $push36=, $20, $36
+; NO-SIMD128-NEXT: i32.and $push37=, $pop36, $4
+; NO-SIMD128-NEXT: i32.xor $push38=, $pop37, $36
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop38
+; NO-SIMD128-NEXT: i32.xor $push39=, $19, $35
+; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $3
+; NO-SIMD128-NEXT: i32.xor $push41=, $pop40, $35
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop41
+; NO-SIMD128-NEXT: i32.xor $push42=, $18, $34
+; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $2
+; NO-SIMD128-NEXT: i32.xor $push44=, $pop43, $34
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop44
+; NO-SIMD128-NEXT: i32.xor $push45=, $17, $33
+; NO-SIMD128-NEXT: i32.and $push46=, $pop45, $1
+; NO-SIMD128-NEXT: i32.xor $push47=, $pop46, $33
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop47
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_v16i8:
@@ -5849,80 +4771,58 @@ define <16 x i8> @bitselect_xor_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $35
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $20, $36
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $4
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $pop12, $36
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop10), $pop13
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $21, $37
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $5
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $37
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $22, $38
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $6
-; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $pop20, $38
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $23, $39
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $7
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $39
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop23), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $24, $40
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $pop29, $8
-; NO-SIMD128-FAST-NEXT: i32.xor $push31=, $pop30, $40
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop31
-; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $25, $41
-; NO-SIMD128-FAST-NEXT: i32.and $push33=, $pop32, $9
-; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $pop33, $41
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.xor $push37=, $26, $42
-; NO-SIMD128-FAST-NEXT: i32.and $push38=, $pop37, $10
-; NO-SIMD128-FAST-NEXT: i32.xor $push39=, $pop38, $42
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push41=, $0, $pop40
-; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $27, $43
-; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $11
-; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $43
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop41), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-FAST-NEXT: i32.xor $push47=, $28, $44
-; NO-SIMD128-FAST-NEXT: i32.and $push48=, $pop47, $12
-; NO-SIMD128-FAST-NEXT: i32.xor $push49=, $pop48, $44
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop46), $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push51=, $0, $pop50
-; NO-SIMD128-FAST-NEXT: i32.xor $push52=, $29, $45
-; NO-SIMD128-FAST-NEXT: i32.and $push53=, $pop52, $13
-; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $pop53, $45
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop51), $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55
-; NO-SIMD128-FAST-NEXT: i32.xor $push57=, $30, $46
-; NO-SIMD128-FAST-NEXT: i32.and $push58=, $pop57, $14
-; NO-SIMD128-FAST-NEXT: i32.xor $push59=, $pop58, $46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop59
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push61=, $0, $pop60
-; NO-SIMD128-FAST-NEXT: i32.xor $push62=, $31, $47
-; NO-SIMD128-FAST-NEXT: i32.and $push63=, $pop62, $15
-; NO-SIMD128-FAST-NEXT: i32.xor $push64=, $pop63, $47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop61), $pop64
-; NO-SIMD128-FAST-NEXT: i32.const $push65=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push66=, $0, $pop65
-; NO-SIMD128-FAST-NEXT: i32.xor $push67=, $32, $48
-; NO-SIMD128-FAST-NEXT: i32.and $push68=, $pop67, $16
-; NO-SIMD128-FAST-NEXT: i32.xor $push69=, $pop68, $48
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop66), $pop69
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $20, $36
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $36
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $21, $37
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $pop12, $5
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $pop13, $37
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $22, $38
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $6
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $pop16, $38
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $23, $39
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $7
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $39
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $24, $40
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $8
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $pop22, $40
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop23
+; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $25, $41
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $9
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $41
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26
+; NO-SIMD128-FAST-NEXT: i32.xor $push27=, $26, $42
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $10
+; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $pop28, $42
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop29
+; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $27, $43
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $11
+; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $43
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $28, $44
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $12
+; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $pop34, $44
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop35
+; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $29, $45
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $13
+; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $pop37, $45
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop38
+; NO-SIMD128-FAST-NEXT: i32.xor $push39=, $30, $46
+; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $14
+; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $pop40, $46
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop41
+; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $31, $47
+; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $15
+; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $47
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop44
+; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $32, $48
+; NO-SIMD128-FAST-NEXT: i32.and $push46=, $pop45, $16
+; NO-SIMD128-FAST-NEXT: i32.xor $push47=, $pop46, $48
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop47
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <16 x i8> %v1, %v2
%and = and <16 x i8> %xor1, %c
@@ -5949,124 +4849,102 @@ define <16 x i8> @bitselect_xor_reversed_v16i8(<16 x i8> %c, <16 x i8> %v1, <16
; NO-SIMD128-LABEL: bitselect_xor_reversed_v16i8:
; NO-SIMD128: .functype bitselect_xor_reversed_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 15
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.xor $push2=, $32, $48
; NO-SIMD128-NEXT: i32.const $push0=, -1
; NO-SIMD128-NEXT: i32.xor $push1=, $16, $pop0
; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.xor $push4=, $pop3, $48
-; NO-SIMD128-NEXT: i32.store8 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.xor $push8=, $31, $47
-; NO-SIMD128-NEXT: i32.const $push101=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $15, $pop101
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.xor $push10=, $pop9, $47
-; NO-SIMD128-NEXT: i32.store8 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 13
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.xor $push14=, $30, $46
-; NO-SIMD128-NEXT: i32.const $push100=, -1
-; NO-SIMD128-NEXT: i32.xor $push13=, $14, $pop100
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop4
+; NO-SIMD128-NEXT: i32.xor $push6=, $31, $47
+; NO-SIMD128-NEXT: i32.const $push79=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $15, $pop79
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $47
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push10=, $30, $46
+; NO-SIMD128-NEXT: i32.const $push78=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $14, $pop78
+; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $46
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop12
+; NO-SIMD128-NEXT: i32.xor $push14=, $29, $45
+; NO-SIMD128-NEXT: i32.const $push77=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $13, $pop77
; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $46
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push23=, 12
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.xor $push20=, $29, $45
-; NO-SIMD128-NEXT: i32.const $push99=, -1
-; NO-SIMD128-NEXT: i32.xor $push19=, $13, $pop99
-; NO-SIMD128-NEXT: i32.and $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.xor $push22=, $pop21, $45
-; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.xor $push26=, $28, $44
-; NO-SIMD128-NEXT: i32.const $push98=, -1
-; NO-SIMD128-NEXT: i32.xor $push25=, $12, $pop98
+; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $45
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop16
+; NO-SIMD128-NEXT: i32.xor $push18=, $28, $44
+; NO-SIMD128-NEXT: i32.const $push76=, -1
+; NO-SIMD128-NEXT: i32.xor $push17=, $12, $pop76
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $44
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop20
+; NO-SIMD128-NEXT: i32.xor $push22=, $27, $43
+; NO-SIMD128-NEXT: i32.const $push75=, -1
+; NO-SIMD128-NEXT: i32.xor $push21=, $11, $pop75
+; NO-SIMD128-NEXT: i32.and $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.xor $push24=, $pop23, $43
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop24
+; NO-SIMD128-NEXT: i32.xor $push26=, $26, $42
+; NO-SIMD128-NEXT: i32.const $push74=, -1
+; NO-SIMD128-NEXT: i32.xor $push25=, $10, $pop74
; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $pop25
-; NO-SIMD128-NEXT: i32.xor $push28=, $pop27, $44
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push35=, 10
-; NO-SIMD128-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-NEXT: i32.xor $push32=, $27, $43
-; NO-SIMD128-NEXT: i32.const $push97=, -1
-; NO-SIMD128-NEXT: i32.xor $push31=, $11, $pop97
-; NO-SIMD128-NEXT: i32.and $push33=, $pop32, $pop31
-; NO-SIMD128-NEXT: i32.xor $push34=, $pop33, $43
-; NO-SIMD128-NEXT: i32.store8 0($pop36), $pop34
-; NO-SIMD128-NEXT: i32.const $push41=, 9
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.xor $push38=, $26, $42
-; NO-SIMD128-NEXT: i32.const $push96=, -1
-; NO-SIMD128-NEXT: i32.xor $push37=, $10, $pop96
+; NO-SIMD128-NEXT: i32.xor $push28=, $pop27, $42
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop28
+; NO-SIMD128-NEXT: i32.xor $push30=, $25, $41
+; NO-SIMD128-NEXT: i32.const $push73=, -1
+; NO-SIMD128-NEXT: i32.xor $push29=, $9, $pop73
+; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $41
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop32
+; NO-SIMD128-NEXT: i32.xor $push34=, $24, $40
+; NO-SIMD128-NEXT: i32.const $push72=, -1
+; NO-SIMD128-NEXT: i32.xor $push33=, $8, $pop72
+; NO-SIMD128-NEXT: i32.and $push35=, $pop34, $pop33
+; NO-SIMD128-NEXT: i32.xor $push36=, $pop35, $40
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop36
+; NO-SIMD128-NEXT: i32.xor $push38=, $23, $39
+; NO-SIMD128-NEXT: i32.const $push71=, -1
+; NO-SIMD128-NEXT: i32.xor $push37=, $7, $pop71
; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.xor $push40=, $pop39, $42
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.xor $push44=, $25, $41
-; NO-SIMD128-NEXT: i32.const $push95=, -1
-; NO-SIMD128-NEXT: i32.xor $push43=, $9, $pop95
-; NO-SIMD128-NEXT: i32.and $push45=, $pop44, $pop43
-; NO-SIMD128-NEXT: i32.xor $push46=, $pop45, $41
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop46
-; NO-SIMD128-NEXT: i32.const $push51=, 7
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.xor $push48=, $24, $40
-; NO-SIMD128-NEXT: i32.const $push94=, -1
-; NO-SIMD128-NEXT: i32.xor $push47=, $8, $pop94
-; NO-SIMD128-NEXT: i32.and $push49=, $pop48, $pop47
-; NO-SIMD128-NEXT: i32.xor $push50=, $pop49, $40
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.const $push57=, 6
-; NO-SIMD128-NEXT: i32.add $push58=, $0, $pop57
-; NO-SIMD128-NEXT: i32.xor $push54=, $23, $39
-; NO-SIMD128-NEXT: i32.const $push93=, -1
-; NO-SIMD128-NEXT: i32.xor $push53=, $7, $pop93
+; NO-SIMD128-NEXT: i32.xor $push40=, $pop39, $39
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop40
+; NO-SIMD128-NEXT: i32.xor $push42=, $22, $38
+; NO-SIMD128-NEXT: i32.const $push70=, -1
+; NO-SIMD128-NEXT: i32.xor $push41=, $6, $pop70
+; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $pop41
+; NO-SIMD128-NEXT: i32.xor $push44=, $pop43, $38
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop44
+; NO-SIMD128-NEXT: i32.xor $push46=, $21, $37
+; NO-SIMD128-NEXT: i32.const $push69=, -1
+; NO-SIMD128-NEXT: i32.xor $push45=, $5, $pop69
+; NO-SIMD128-NEXT: i32.and $push47=, $pop46, $pop45
+; NO-SIMD128-NEXT: i32.xor $push48=, $pop47, $37
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop48
+; NO-SIMD128-NEXT: i32.xor $push50=, $20, $36
+; NO-SIMD128-NEXT: i32.const $push68=, -1
+; NO-SIMD128-NEXT: i32.xor $push49=, $4, $pop68
+; NO-SIMD128-NEXT: i32.and $push51=, $pop50, $pop49
+; NO-SIMD128-NEXT: i32.xor $push52=, $pop51, $36
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop52
+; NO-SIMD128-NEXT: i32.xor $push54=, $19, $35
+; NO-SIMD128-NEXT: i32.const $push67=, -1
+; NO-SIMD128-NEXT: i32.xor $push53=, $3, $pop67
; NO-SIMD128-NEXT: i32.and $push55=, $pop54, $pop53
-; NO-SIMD128-NEXT: i32.xor $push56=, $pop55, $39
-; NO-SIMD128-NEXT: i32.store8 0($pop58), $pop56
-; NO-SIMD128-NEXT: i32.const $push63=, 5
-; NO-SIMD128-NEXT: i32.add $push64=, $0, $pop63
-; NO-SIMD128-NEXT: i32.xor $push60=, $22, $38
-; NO-SIMD128-NEXT: i32.const $push92=, -1
-; NO-SIMD128-NEXT: i32.xor $push59=, $6, $pop92
-; NO-SIMD128-NEXT: i32.and $push61=, $pop60, $pop59
-; NO-SIMD128-NEXT: i32.xor $push62=, $pop61, $38
-; NO-SIMD128-NEXT: i32.store8 0($pop64), $pop62
-; NO-SIMD128-NEXT: i32.xor $push66=, $21, $37
-; NO-SIMD128-NEXT: i32.const $push91=, -1
-; NO-SIMD128-NEXT: i32.xor $push65=, $5, $pop91
-; NO-SIMD128-NEXT: i32.and $push67=, $pop66, $pop65
-; NO-SIMD128-NEXT: i32.xor $push68=, $pop67, $37
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop68
-; NO-SIMD128-NEXT: i32.const $push73=, 3
-; NO-SIMD128-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-NEXT: i32.xor $push70=, $20, $36
-; NO-SIMD128-NEXT: i32.const $push90=, -1
-; NO-SIMD128-NEXT: i32.xor $push69=, $4, $pop90
-; NO-SIMD128-NEXT: i32.and $push71=, $pop70, $pop69
-; NO-SIMD128-NEXT: i32.xor $push72=, $pop71, $36
-; NO-SIMD128-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-NEXT: i32.xor $push76=, $19, $35
-; NO-SIMD128-NEXT: i32.const $push89=, -1
-; NO-SIMD128-NEXT: i32.xor $push75=, $3, $pop89
-; NO-SIMD128-NEXT: i32.and $push77=, $pop76, $pop75
-; NO-SIMD128-NEXT: i32.xor $push78=, $pop77, $35
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop78
-; NO-SIMD128-NEXT: i32.xor $push80=, $18, $34
-; NO-SIMD128-NEXT: i32.const $push88=, -1
-; NO-SIMD128-NEXT: i32.xor $push79=, $2, $pop88
-; NO-SIMD128-NEXT: i32.and $push81=, $pop80, $pop79
-; NO-SIMD128-NEXT: i32.xor $push82=, $pop81, $34
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop82
-; NO-SIMD128-NEXT: i32.xor $push84=, $17, $33
-; NO-SIMD128-NEXT: i32.const $push87=, -1
-; NO-SIMD128-NEXT: i32.xor $push83=, $1, $pop87
-; NO-SIMD128-NEXT: i32.and $push85=, $pop84, $pop83
-; NO-SIMD128-NEXT: i32.xor $push86=, $pop85, $33
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop86
+; NO-SIMD128-NEXT: i32.xor $push56=, $pop55, $35
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop56
+; NO-SIMD128-NEXT: i32.xor $push58=, $18, $34
+; NO-SIMD128-NEXT: i32.const $push66=, -1
+; NO-SIMD128-NEXT: i32.xor $push57=, $2, $pop66
+; NO-SIMD128-NEXT: i32.and $push59=, $pop58, $pop57
+; NO-SIMD128-NEXT: i32.xor $push60=, $pop59, $34
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop60
+; NO-SIMD128-NEXT: i32.xor $push62=, $17, $33
+; NO-SIMD128-NEXT: i32.const $push65=, -1
+; NO-SIMD128-NEXT: i32.xor $push61=, $1, $pop65
+; NO-SIMD128-NEXT: i32.and $push63=, $pop62, $pop61
+; NO-SIMD128-NEXT: i32.xor $push64=, $pop63, $33
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop64
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_reversed_v16i8:
@@ -6079,117 +4957,95 @@ define <16 x i8> @bitselect_xor_reversed_v16i8(<16 x i8> %c, <16 x i8> %v1, <16
; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $pop3, $33
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4
; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $18, $34
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop101
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop79
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $34
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8
; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $19, $35
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop100
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop78
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $pop11, $35
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $20, $36
-; NO-SIMD128-FAST-NEXT: i32.const $push99=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop99
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop77
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $36
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $21, $37
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $5, $pop98
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $pop21, $37
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $22, $38
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $6, $pop97
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $38
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $23, $39
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $7, $pop96
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $21, $37
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $5, $pop76
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $37
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $22, $38
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $6, $pop75
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $pop23, $38
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $23, $39
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $7, $pop74
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.xor $push28=, $pop27, $39
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $24, $40
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $8, $pop73
; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $39
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $24, $40
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $8, $pop95
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $pop37, $40
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $25, $41
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $9, $pop94
+; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $40
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $25, $41
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $9, $pop72
+; NO-SIMD128-FAST-NEXT: i32.and $push35=, $pop34, $pop33
+; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $pop35, $41
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $26, $42
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push37=, $10, $pop71
+; NO-SIMD128-FAST-NEXT: i32.and $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT: i32.xor $push40=, $pop39, $42
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40
+; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $27, $43
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $11, $pop70
; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $pop41
-; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $41
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $26, $42
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $10, $pop93
+; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $43
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44
+; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $28, $44
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $12, $pop69
; NO-SIMD128-FAST-NEXT: i32.and $push47=, $pop46, $pop45
-; NO-SIMD128-FAST-NEXT: i32.xor $push48=, $pop47, $42
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55
-; NO-SIMD128-FAST-NEXT: i32.xor $push52=, $27, $43
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push51=, $11, $pop92
-; NO-SIMD128-FAST-NEXT: i32.and $push53=, $pop52, $pop51
-; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $pop53, $43
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61
-; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $28, $44
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push57=, $12, $pop91
+; NO-SIMD128-FAST-NEXT: i32.xor $push48=, $pop47, $44
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48
+; NO-SIMD128-FAST-NEXT: i32.xor $push50=, $29, $45
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push49=, $13, $pop68
+; NO-SIMD128-FAST-NEXT: i32.and $push51=, $pop50, $pop49
+; NO-SIMD128-FAST-NEXT: i32.xor $push52=, $pop51, $45
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52
+; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $30, $46
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push53=, $14, $pop67
+; NO-SIMD128-FAST-NEXT: i32.and $push55=, $pop54, $pop53
+; NO-SIMD128-FAST-NEXT: i32.xor $push56=, $pop55, $46
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56
+; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $31, $47
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push57=, $15, $pop66
; NO-SIMD128-FAST-NEXT: i32.and $push59=, $pop58, $pop57
-; NO-SIMD128-FAST-NEXT: i32.xor $push60=, $pop59, $44
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop60
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push68=, $0, $pop67
-; NO-SIMD128-FAST-NEXT: i32.xor $push64=, $29, $45
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push63=, $13, $pop90
-; NO-SIMD128-FAST-NEXT: i32.and $push65=, $pop64, $pop63
-; NO-SIMD128-FAST-NEXT: i32.xor $push66=, $pop65, $45
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop68), $pop66
-; NO-SIMD128-FAST-NEXT: i32.const $push73=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-FAST-NEXT: i32.xor $push70=, $30, $46
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push69=, $14, $pop89
-; NO-SIMD128-FAST-NEXT: i32.and $push71=, $pop70, $pop69
-; NO-SIMD128-FAST-NEXT: i32.xor $push72=, $pop71, $46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push80=, $0, $pop79
-; NO-SIMD128-FAST-NEXT: i32.xor $push76=, $31, $47
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push75=, $15, $pop88
-; NO-SIMD128-FAST-NEXT: i32.and $push77=, $pop76, $pop75
-; NO-SIMD128-FAST-NEXT: i32.xor $push78=, $pop77, $47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop80), $pop78
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push86=, $0, $pop85
-; NO-SIMD128-FAST-NEXT: i32.xor $push82=, $32, $48
-; NO-SIMD128-FAST-NEXT: i32.const $push87=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push81=, $16, $pop87
-; NO-SIMD128-FAST-NEXT: i32.and $push83=, $pop82, $pop81
-; NO-SIMD128-FAST-NEXT: i32.xor $push84=, $pop83, $48
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop86), $pop84
+; NO-SIMD128-FAST-NEXT: i32.xor $push60=, $pop59, $47
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60
+; NO-SIMD128-FAST-NEXT: i32.xor $push62=, $32, $48
+; NO-SIMD128-FAST-NEXT: i32.const $push65=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push61=, $16, $pop65
+; NO-SIMD128-FAST-NEXT: i32.and $push63=, $pop62, $pop61
+; NO-SIMD128-FAST-NEXT: i32.xor $push64=, $pop63, $48
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <16 x i8> %v1, %v2
%notc = xor <16 x i8> %c, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
@@ -6218,30 +5074,22 @@ define <8 x i16> @add_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: add_v8i16:
; NO-SIMD128: .functype add_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.add $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.add $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.add $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.add $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.add $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.add $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.add $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.add $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.add $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.add $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.add $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.add $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.add $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.add $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.add $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: add_v8i16:
@@ -6253,24 +5101,16 @@ define <8 x i16> @add_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.add $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.add $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.add $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.add $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.add $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = add <8 x i16> %x, %y
ret <8 x i16> %a
@@ -6292,30 +5132,22 @@ define <8 x i16> @sub_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: sub_v8i16:
; NO-SIMD128: .functype sub_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.sub $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.sub $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.sub $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.sub $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.sub $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.sub $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.sub $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.sub $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.sub $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.sub $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.sub $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.sub $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.sub $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.sub $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.sub $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.sub $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: sub_v8i16:
@@ -6327,24 +5159,16 @@ define <8 x i16> @sub_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = sub <8 x i16> %x, %y
ret <8 x i16> %a
@@ -6366,30 +5190,22 @@ define <8 x i16> @mul_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: mul_v8i16:
; NO-SIMD128: .functype mul_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.mul $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.mul $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.mul $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.mul $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.mul $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.mul $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.mul $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.mul $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.mul $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.mul $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.mul $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.mul $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.mul $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.mul $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.mul $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.mul $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: mul_v8i16:
@@ -6401,24 +5217,16 @@ define <8 x i16> @mul_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.mul $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.mul $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.mul $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = mul <8 x i16> %x, %y
ret <8 x i16> %a
@@ -6440,54 +5248,46 @@ define <8 x i16> @min_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: min_s_v8i16:
; NO-SIMD128: .functype min_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 14
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend16_s $push1=, $8
; NO-SIMD128-NEXT: i32.extend16_s $push0=, $16
; NO-SIMD128-NEXT: i32.lt_s $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $8, $16, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push10=, 12
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.extend16_s $push7=, $7
-; NO-SIMD128-NEXT: i32.extend16_s $push6=, $15
-; NO-SIMD128-NEXT: i32.lt_s $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $7, $15, $pop8
-; NO-SIMD128-NEXT: i32.store16 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push16=, 10
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.extend16_s $push13=, $6
-; NO-SIMD128-NEXT: i32.extend16_s $push12=, $14
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
+; NO-SIMD128-NEXT: i32.extend16_s $push5=, $7
+; NO-SIMD128-NEXT: i32.extend16_s $push4=, $15
+; NO-SIMD128-NEXT: i32.lt_s $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $7, $15, $pop6
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-NEXT: i32.extend16_s $push9=, $6
+; NO-SIMD128-NEXT: i32.extend16_s $push8=, $14
+; NO-SIMD128-NEXT: i32.lt_s $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $6, $14, $pop10
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop11
+; NO-SIMD128-NEXT: i32.extend16_s $push13=, $5
+; NO-SIMD128-NEXT: i32.extend16_s $push12=, $13
; NO-SIMD128-NEXT: i32.lt_s $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.select $push15=, $6, $14, $pop14
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.extend16_s $push19=, $5
-; NO-SIMD128-NEXT: i32.extend16_s $push18=, $13
-; NO-SIMD128-NEXT: i32.lt_s $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.select $push21=, $5, $13, $pop20
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop21
-; NO-SIMD128-NEXT: i32.const $push26=, 6
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-NEXT: i32.extend16_s $push23=, $4
-; NO-SIMD128-NEXT: i32.extend16_s $push22=, $12
-; NO-SIMD128-NEXT: i32.lt_s $push24=, $pop23, $pop22
-; NO-SIMD128-NEXT: i32.select $push25=, $4, $12, $pop24
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.extend16_s $push29=, $3
-; NO-SIMD128-NEXT: i32.extend16_s $push28=, $11
+; NO-SIMD128-NEXT: i32.select $push15=, $5, $13, $pop14
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-NEXT: i32.extend16_s $push17=, $4
+; NO-SIMD128-NEXT: i32.extend16_s $push16=, $12
+; NO-SIMD128-NEXT: i32.lt_s $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.select $push19=, $4, $12, $pop18
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop19
+; NO-SIMD128-NEXT: i32.extend16_s $push21=, $3
+; NO-SIMD128-NEXT: i32.extend16_s $push20=, $11
+; NO-SIMD128-NEXT: i32.lt_s $push22=, $pop21, $pop20
+; NO-SIMD128-NEXT: i32.select $push23=, $3, $11, $pop22
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop23
+; NO-SIMD128-NEXT: i32.extend16_s $push25=, $2
+; NO-SIMD128-NEXT: i32.extend16_s $push24=, $10
+; NO-SIMD128-NEXT: i32.lt_s $push26=, $pop25, $pop24
+; NO-SIMD128-NEXT: i32.select $push27=, $2, $10, $pop26
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop27
+; NO-SIMD128-NEXT: i32.extend16_s $push29=, $1
+; NO-SIMD128-NEXT: i32.extend16_s $push28=, $9
; NO-SIMD128-NEXT: i32.lt_s $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.select $push31=, $3, $11, $pop30
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop31
-; NO-SIMD128-NEXT: i32.extend16_s $push33=, $2
-; NO-SIMD128-NEXT: i32.extend16_s $push32=, $10
-; NO-SIMD128-NEXT: i32.lt_s $push34=, $pop33, $pop32
-; NO-SIMD128-NEXT: i32.select $push35=, $2, $10, $pop34
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop35
-; NO-SIMD128-NEXT: i32.extend16_s $push37=, $1
-; NO-SIMD128-NEXT: i32.extend16_s $push36=, $9
-; NO-SIMD128-NEXT: i32.lt_s $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.select $push39=, $1, $9, $pop38
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop39
+; NO-SIMD128-NEXT: i32.select $push31=, $1, $9, $pop30
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop31
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_s_v8i16:
@@ -6508,39 +5308,31 @@ define <8 x i16> @min_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.lt_s $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $3, $11, $pop10
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $4
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push12=, $12
; NO-SIMD128-FAST-NEXT: i32.lt_s $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $4, $12, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push19=, $5
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push18=, $13
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.select $push21=, $5, $13, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $6
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push22=, $14
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push24=, $pop23, $pop22
-; NO-SIMD128-FAST-NEXT: i32.select $push25=, $6, $14, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push29=, $7
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push17=, $5
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push16=, $13
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.select $push19=, $5, $13, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop19
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push21=, $6
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push20=, $14
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push22=, $pop21, $pop20
+; NO-SIMD128-FAST-NEXT: i32.select $push23=, $6, $14, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop23
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push25=, $7
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push24=, $15
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push26=, $pop25, $pop24
+; NO-SIMD128-FAST-NEXT: i32.select $push27=, $7, $15, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push29=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $16
; NO-SIMD128-FAST-NEXT: i32.lt_s $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.select $push31=, $7, $15, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop33), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push35=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push34=, $16
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push36=, $pop35, $pop34
-; NO-SIMD128-FAST-NEXT: i32.select $push37=, $8, $16, $pop36
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop39), $pop37
+; NO-SIMD128-FAST-NEXT: i32.select $push31=, $8, $16, $pop30
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop31
; NO-SIMD128-FAST-NEXT: return
%c = icmp slt <8 x i16> %x, %y
%a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
@@ -6563,70 +5355,62 @@ define <8 x i16> @min_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: min_u_v8i16:
; NO-SIMD128: .functype min_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.const $push0=, 65535
; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0
-; NO-SIMD128-NEXT: i32.const $push55=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop55
+; NO-SIMD128-NEXT: i32.const $push47=, 65535
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop47
; NO-SIMD128-NEXT: i32.lt_u $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.select $push4=, $8, $16, $pop3
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 12
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push54=, 65535
-; NO-SIMD128-NEXT: i32.and $push8=, $7, $pop54
-; NO-SIMD128-NEXT: i32.const $push53=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $15, $pop53
-; NO-SIMD128-NEXT: i32.lt_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.select $push10=, $7, $15, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 10
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push52=, 65535
-; NO-SIMD128-NEXT: i32.and $push14=, $6, $pop52
-; NO-SIMD128-NEXT: i32.const $push51=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $14, $pop51
-; NO-SIMD128-NEXT: i32.lt_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.select $push16=, $6, $14, $pop15
-; NO-SIMD128-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push50=, 65535
-; NO-SIMD128-NEXT: i32.and $push20=, $5, $pop50
-; NO-SIMD128-NEXT: i32.const $push49=, 65535
-; NO-SIMD128-NEXT: i32.and $push19=, $13, $pop49
-; NO-SIMD128-NEXT: i32.lt_u $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.select $push22=, $5, $13, $pop21
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-NEXT: i32.const $push27=, 6
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.const $push48=, 65535
-; NO-SIMD128-NEXT: i32.and $push24=, $4, $pop48
-; NO-SIMD128-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-NEXT: i32.and $push23=, $12, $pop47
-; NO-SIMD128-NEXT: i32.lt_u $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.select $push26=, $4, $12, $pop25
-; NO-SIMD128-NEXT: i32.store16 0($pop28), $pop26
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop4
; NO-SIMD128-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-NEXT: i32.and $push30=, $3, $pop46
+; NO-SIMD128-NEXT: i32.and $push6=, $7, $pop46
; NO-SIMD128-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-NEXT: i32.and $push29=, $11, $pop45
-; NO-SIMD128-NEXT: i32.lt_u $push31=, $pop30, $pop29
-; NO-SIMD128-NEXT: i32.select $push32=, $3, $11, $pop31
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop32
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $pop45
+; NO-SIMD128-NEXT: i32.lt_u $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.select $push8=, $7, $15, $pop7
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop8
; NO-SIMD128-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-NEXT: i32.and $push34=, $2, $pop44
+; NO-SIMD128-NEXT: i32.and $push10=, $6, $pop44
; NO-SIMD128-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-NEXT: i32.and $push33=, $10, $pop43
-; NO-SIMD128-NEXT: i32.lt_u $push35=, $pop34, $pop33
-; NO-SIMD128-NEXT: i32.select $push36=, $2, $10, $pop35
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop36
+; NO-SIMD128-NEXT: i32.and $push9=, $14, $pop43
+; NO-SIMD128-NEXT: i32.lt_u $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.select $push12=, $6, $14, $pop11
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop12
; NO-SIMD128-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-NEXT: i32.and $push38=, $1, $pop42
+; NO-SIMD128-NEXT: i32.and $push14=, $5, $pop42
; NO-SIMD128-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-NEXT: i32.and $push37=, $9, $pop41
-; NO-SIMD128-NEXT: i32.lt_u $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.select $push40=, $1, $9, $pop39
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop40
+; NO-SIMD128-NEXT: i32.and $push13=, $13, $pop41
+; NO-SIMD128-NEXT: i32.lt_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.select $push16=, $5, $13, $pop15
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop16
+; NO-SIMD128-NEXT: i32.const $push40=, 65535
+; NO-SIMD128-NEXT: i32.and $push18=, $4, $pop40
+; NO-SIMD128-NEXT: i32.const $push39=, 65535
+; NO-SIMD128-NEXT: i32.and $push17=, $12, $pop39
+; NO-SIMD128-NEXT: i32.lt_u $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.select $push20=, $4, $12, $pop19
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push38=, 65535
+; NO-SIMD128-NEXT: i32.and $push22=, $3, $pop38
+; NO-SIMD128-NEXT: i32.const $push37=, 65535
+; NO-SIMD128-NEXT: i32.and $push21=, $11, $pop37
+; NO-SIMD128-NEXT: i32.lt_u $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.select $push24=, $3, $11, $pop23
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push36=, 65535
+; NO-SIMD128-NEXT: i32.and $push26=, $2, $pop36
+; NO-SIMD128-NEXT: i32.const $push35=, 65535
+; NO-SIMD128-NEXT: i32.and $push25=, $10, $pop35
+; NO-SIMD128-NEXT: i32.lt_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.select $push28=, $2, $10, $pop27
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push34=, 65535
+; NO-SIMD128-NEXT: i32.and $push30=, $1, $pop34
+; NO-SIMD128-NEXT: i32.const $push33=, 65535
+; NO-SIMD128-NEXT: i32.and $push29=, $9, $pop33
+; NO-SIMD128-NEXT: i32.lt_u $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT: i32.select $push32=, $1, $9, $pop31
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_u_v8i16:
@@ -6634,68 +5418,60 @@ define <8 x i16> @min_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop55
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop47
; NO-SIMD128-FAST-NEXT: i32.lt_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.select $push4=, $1, $9, $pop3
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop46
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop45
; NO-SIMD128-FAST-NEXT: i32.lt_u $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.select $push8=, $2, $10, $pop7
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop52
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $pop51
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $pop43
; NO-SIMD128-FAST-NEXT: i32.lt_u $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.select $push12=, $3, $11, $pop11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $pop49
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop42
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $pop41
; NO-SIMD128-FAST-NEXT: i32.lt_u $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.select $push16=, $4, $12, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $5, $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $13, $pop47
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.select $push22=, $5, $13, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $6, $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $14, $pop45
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.select $push26=, $6, $14, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $7, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $15, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $5, $pop40
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $13, $pop39
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.select $push20=, $5, $13, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $6, $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $14, $pop37
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.select $push24=, $6, $14, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $7, $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $pop35
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.select $push28=, $7, $15, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push30=, $8, $pop34
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $pop33
; NO-SIMD128-FAST-NEXT: i32.lt_u $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.select $push32=, $7, $15, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push36=, $8, $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $16, $pop41
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.select $push38=, $8, $16, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT: i32.select $push32=, $8, $16, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%c = icmp ult <8 x i16> %x, %y
%a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
@@ -6718,54 +5494,46 @@ define <8 x i16> @max_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: max_s_v8i16:
; NO-SIMD128: .functype max_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 14
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend16_s $push1=, $8
; NO-SIMD128-NEXT: i32.extend16_s $push0=, $16
; NO-SIMD128-NEXT: i32.gt_s $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $8, $16, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push10=, 12
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.extend16_s $push7=, $7
-; NO-SIMD128-NEXT: i32.extend16_s $push6=, $15
-; NO-SIMD128-NEXT: i32.gt_s $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $7, $15, $pop8
-; NO-SIMD128-NEXT: i32.store16 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push16=, 10
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.extend16_s $push13=, $6
-; NO-SIMD128-NEXT: i32.extend16_s $push12=, $14
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
+; NO-SIMD128-NEXT: i32.extend16_s $push5=, $7
+; NO-SIMD128-NEXT: i32.extend16_s $push4=, $15
+; NO-SIMD128-NEXT: i32.gt_s $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $7, $15, $pop6
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-NEXT: i32.extend16_s $push9=, $6
+; NO-SIMD128-NEXT: i32.extend16_s $push8=, $14
+; NO-SIMD128-NEXT: i32.gt_s $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $6, $14, $pop10
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop11
+; NO-SIMD128-NEXT: i32.extend16_s $push13=, $5
+; NO-SIMD128-NEXT: i32.extend16_s $push12=, $13
; NO-SIMD128-NEXT: i32.gt_s $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.select $push15=, $6, $14, $pop14
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.extend16_s $push19=, $5
-; NO-SIMD128-NEXT: i32.extend16_s $push18=, $13
-; NO-SIMD128-NEXT: i32.gt_s $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.select $push21=, $5, $13, $pop20
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop21
-; NO-SIMD128-NEXT: i32.const $push26=, 6
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-NEXT: i32.extend16_s $push23=, $4
-; NO-SIMD128-NEXT: i32.extend16_s $push22=, $12
-; NO-SIMD128-NEXT: i32.gt_s $push24=, $pop23, $pop22
-; NO-SIMD128-NEXT: i32.select $push25=, $4, $12, $pop24
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.extend16_s $push29=, $3
-; NO-SIMD128-NEXT: i32.extend16_s $push28=, $11
+; NO-SIMD128-NEXT: i32.select $push15=, $5, $13, $pop14
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-NEXT: i32.extend16_s $push17=, $4
+; NO-SIMD128-NEXT: i32.extend16_s $push16=, $12
+; NO-SIMD128-NEXT: i32.gt_s $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.select $push19=, $4, $12, $pop18
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop19
+; NO-SIMD128-NEXT: i32.extend16_s $push21=, $3
+; NO-SIMD128-NEXT: i32.extend16_s $push20=, $11
+; NO-SIMD128-NEXT: i32.gt_s $push22=, $pop21, $pop20
+; NO-SIMD128-NEXT: i32.select $push23=, $3, $11, $pop22
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop23
+; NO-SIMD128-NEXT: i32.extend16_s $push25=, $2
+; NO-SIMD128-NEXT: i32.extend16_s $push24=, $10
+; NO-SIMD128-NEXT: i32.gt_s $push26=, $pop25, $pop24
+; NO-SIMD128-NEXT: i32.select $push27=, $2, $10, $pop26
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop27
+; NO-SIMD128-NEXT: i32.extend16_s $push29=, $1
+; NO-SIMD128-NEXT: i32.extend16_s $push28=, $9
; NO-SIMD128-NEXT: i32.gt_s $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.select $push31=, $3, $11, $pop30
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop31
-; NO-SIMD128-NEXT: i32.extend16_s $push33=, $2
-; NO-SIMD128-NEXT: i32.extend16_s $push32=, $10
-; NO-SIMD128-NEXT: i32.gt_s $push34=, $pop33, $pop32
-; NO-SIMD128-NEXT: i32.select $push35=, $2, $10, $pop34
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop35
-; NO-SIMD128-NEXT: i32.extend16_s $push37=, $1
-; NO-SIMD128-NEXT: i32.extend16_s $push36=, $9
-; NO-SIMD128-NEXT: i32.gt_s $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.select $push39=, $1, $9, $pop38
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop39
+; NO-SIMD128-NEXT: i32.select $push31=, $1, $9, $pop30
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop31
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_s_v8i16:
@@ -6786,39 +5554,31 @@ define <8 x i16> @max_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.gt_s $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $3, $11, $pop10
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $4
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push12=, $12
; NO-SIMD128-FAST-NEXT: i32.gt_s $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $4, $12, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push19=, $5
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push18=, $13
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.select $push21=, $5, $13, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $6
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push22=, $14
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push24=, $pop23, $pop22
-; NO-SIMD128-FAST-NEXT: i32.select $push25=, $6, $14, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push29=, $7
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push17=, $5
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push16=, $13
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.select $push19=, $5, $13, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop19
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push21=, $6
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push20=, $14
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push22=, $pop21, $pop20
+; NO-SIMD128-FAST-NEXT: i32.select $push23=, $6, $14, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop23
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push25=, $7
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push24=, $15
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push26=, $pop25, $pop24
+; NO-SIMD128-FAST-NEXT: i32.select $push27=, $7, $15, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push29=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $16
; NO-SIMD128-FAST-NEXT: i32.gt_s $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.select $push31=, $7, $15, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop33), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push35=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push34=, $16
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push36=, $pop35, $pop34
-; NO-SIMD128-FAST-NEXT: i32.select $push37=, $8, $16, $pop36
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop39), $pop37
+; NO-SIMD128-FAST-NEXT: i32.select $push31=, $8, $16, $pop30
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop31
; NO-SIMD128-FAST-NEXT: return
%c = icmp sgt <8 x i16> %x, %y
%a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
@@ -6841,70 +5601,62 @@ define <8 x i16> @max_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: max_u_v8i16:
; NO-SIMD128: .functype max_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.const $push0=, 65535
; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0
-; NO-SIMD128-NEXT: i32.const $push55=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop55
+; NO-SIMD128-NEXT: i32.const $push47=, 65535
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop47
; NO-SIMD128-NEXT: i32.gt_u $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.select $push4=, $8, $16, $pop3
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 12
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push54=, 65535
-; NO-SIMD128-NEXT: i32.and $push8=, $7, $pop54
-; NO-SIMD128-NEXT: i32.const $push53=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $15, $pop53
-; NO-SIMD128-NEXT: i32.gt_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.select $push10=, $7, $15, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 10
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push52=, 65535
-; NO-SIMD128-NEXT: i32.and $push14=, $6, $pop52
-; NO-SIMD128-NEXT: i32.const $push51=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $14, $pop51
-; NO-SIMD128-NEXT: i32.gt_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.select $push16=, $6, $14, $pop15
-; NO-SIMD128-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push50=, 65535
-; NO-SIMD128-NEXT: i32.and $push20=, $5, $pop50
-; NO-SIMD128-NEXT: i32.const $push49=, 65535
-; NO-SIMD128-NEXT: i32.and $push19=, $13, $pop49
-; NO-SIMD128-NEXT: i32.gt_u $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.select $push22=, $5, $13, $pop21
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-NEXT: i32.const $push27=, 6
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.const $push48=, 65535
-; NO-SIMD128-NEXT: i32.and $push24=, $4, $pop48
-; NO-SIMD128-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-NEXT: i32.and $push23=, $12, $pop47
-; NO-SIMD128-NEXT: i32.gt_u $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.select $push26=, $4, $12, $pop25
-; NO-SIMD128-NEXT: i32.store16 0($pop28), $pop26
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop4
; NO-SIMD128-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-NEXT: i32.and $push30=, $3, $pop46
+; NO-SIMD128-NEXT: i32.and $push6=, $7, $pop46
; NO-SIMD128-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-NEXT: i32.and $push29=, $11, $pop45
-; NO-SIMD128-NEXT: i32.gt_u $push31=, $pop30, $pop29
-; NO-SIMD128-NEXT: i32.select $push32=, $3, $11, $pop31
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop32
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $pop45
+; NO-SIMD128-NEXT: i32.gt_u $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.select $push8=, $7, $15, $pop7
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop8
; NO-SIMD128-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-NEXT: i32.and $push34=, $2, $pop44
+; NO-SIMD128-NEXT: i32.and $push10=, $6, $pop44
; NO-SIMD128-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-NEXT: i32.and $push33=, $10, $pop43
-; NO-SIMD128-NEXT: i32.gt_u $push35=, $pop34, $pop33
-; NO-SIMD128-NEXT: i32.select $push36=, $2, $10, $pop35
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop36
+; NO-SIMD128-NEXT: i32.and $push9=, $14, $pop43
+; NO-SIMD128-NEXT: i32.gt_u $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.select $push12=, $6, $14, $pop11
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop12
; NO-SIMD128-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-NEXT: i32.and $push38=, $1, $pop42
+; NO-SIMD128-NEXT: i32.and $push14=, $5, $pop42
; NO-SIMD128-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-NEXT: i32.and $push37=, $9, $pop41
-; NO-SIMD128-NEXT: i32.gt_u $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.select $push40=, $1, $9, $pop39
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop40
+; NO-SIMD128-NEXT: i32.and $push13=, $13, $pop41
+; NO-SIMD128-NEXT: i32.gt_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.select $push16=, $5, $13, $pop15
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop16
+; NO-SIMD128-NEXT: i32.const $push40=, 65535
+; NO-SIMD128-NEXT: i32.and $push18=, $4, $pop40
+; NO-SIMD128-NEXT: i32.const $push39=, 65535
+; NO-SIMD128-NEXT: i32.and $push17=, $12, $pop39
+; NO-SIMD128-NEXT: i32.gt_u $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.select $push20=, $4, $12, $pop19
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push38=, 65535
+; NO-SIMD128-NEXT: i32.and $push22=, $3, $pop38
+; NO-SIMD128-NEXT: i32.const $push37=, 65535
+; NO-SIMD128-NEXT: i32.and $push21=, $11, $pop37
+; NO-SIMD128-NEXT: i32.gt_u $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.select $push24=, $3, $11, $pop23
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push36=, 65535
+; NO-SIMD128-NEXT: i32.and $push26=, $2, $pop36
+; NO-SIMD128-NEXT: i32.const $push35=, 65535
+; NO-SIMD128-NEXT: i32.and $push25=, $10, $pop35
+; NO-SIMD128-NEXT: i32.gt_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.select $push28=, $2, $10, $pop27
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push34=, 65535
+; NO-SIMD128-NEXT: i32.and $push30=, $1, $pop34
+; NO-SIMD128-NEXT: i32.const $push33=, 65535
+; NO-SIMD128-NEXT: i32.and $push29=, $9, $pop33
+; NO-SIMD128-NEXT: i32.gt_u $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT: i32.select $push32=, $1, $9, $pop31
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_u_v8i16:
@@ -6912,68 +5664,60 @@ define <8 x i16> @max_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop55
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop47
; NO-SIMD128-FAST-NEXT: i32.gt_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.select $push4=, $1, $9, $pop3
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop46
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop45
; NO-SIMD128-FAST-NEXT: i32.gt_u $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.select $push8=, $2, $10, $pop7
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop52
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $pop51
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $pop43
; NO-SIMD128-FAST-NEXT: i32.gt_u $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.select $push12=, $3, $11, $pop11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $pop49
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop42
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $pop41
; NO-SIMD128-FAST-NEXT: i32.gt_u $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.select $push16=, $4, $12, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $5, $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $13, $pop47
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.select $push22=, $5, $13, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $6, $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $14, $pop45
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.select $push26=, $6, $14, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $7, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $15, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $5, $pop40
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $13, $pop39
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.select $push20=, $5, $13, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $6, $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $14, $pop37
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.select $push24=, $6, $14, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $7, $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $pop35
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.select $push28=, $7, $15, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push30=, $8, $pop34
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $pop33
; NO-SIMD128-FAST-NEXT: i32.gt_u $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.select $push32=, $7, $15, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push36=, $8, $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $16, $pop41
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.select $push38=, $8, $16, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT: i32.select $push32=, $8, $16, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%c = icmp ugt <8 x i16> %x, %y
%a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
@@ -6996,78 +5740,70 @@ define <8 x i16> @avgr_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: avgr_u_v8i16:
; NO-SIMD128: .functype avgr_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.add $push2=, $8, $16
-; NO-SIMD128-NEXT: i32.const $push3=, 1
-; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 65534
-; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5
-; NO-SIMD128-NEXT: i32.const $push63=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop63
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $pop7
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.add $push10=, $7, $15
-; NO-SIMD128-NEXT: i32.const $push62=, 1
-; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop62
-; NO-SIMD128-NEXT: i32.const $push61=, 65534
-; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop61
-; NO-SIMD128-NEXT: i32.const $push60=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop60
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop13
-; NO-SIMD128-NEXT: i32.const $push14=, 10
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.add $push16=, $6, $14
-; NO-SIMD128-NEXT: i32.const $push59=, 1
-; NO-SIMD128-NEXT: i32.add $push17=, $pop16, $pop59
-; NO-SIMD128-NEXT: i32.const $push58=, 65534
-; NO-SIMD128-NEXT: i32.and $push18=, $pop17, $pop58
-; NO-SIMD128-NEXT: i32.const $push57=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push19=, $pop18, $pop57
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop19
-; NO-SIMD128-NEXT: i32.add $push20=, $5, $13
-; NO-SIMD128-NEXT: i32.const $push56=, 1
-; NO-SIMD128-NEXT: i32.add $push21=, $pop20, $pop56
-; NO-SIMD128-NEXT: i32.const $push55=, 65534
-; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $pop55
+; NO-SIMD128-NEXT: i32.add $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.const $push1=, 1
+; NO-SIMD128-NEXT: i32.add $push2=, $pop0, $pop1
+; NO-SIMD128-NEXT: i32.const $push3=, 65534
+; NO-SIMD128-NEXT: i32.and $push4=, $pop2, $pop3
+; NO-SIMD128-NEXT: i32.const $push55=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push5=, $pop4, $pop55
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $7, $15
; NO-SIMD128-NEXT: i32.const $push54=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push23=, $pop22, $pop54
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop23
-; NO-SIMD128-NEXT: i32.const $push24=, 6
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.add $push26=, $4, $12
-; NO-SIMD128-NEXT: i32.const $push53=, 1
-; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop53
-; NO-SIMD128-NEXT: i32.const $push52=, 65534
-; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop52
+; NO-SIMD128-NEXT: i32.add $push7=, $pop6, $pop54
+; NO-SIMD128-NEXT: i32.const $push53=, 65534
+; NO-SIMD128-NEXT: i32.and $push8=, $pop7, $pop53
+; NO-SIMD128-NEXT: i32.const $push52=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop52
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop9
+; NO-SIMD128-NEXT: i32.add $push10=, $6, $14
; NO-SIMD128-NEXT: i32.const $push51=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop51
-; NO-SIMD128-NEXT: i32.store16 0($pop25), $pop29
-; NO-SIMD128-NEXT: i32.add $push30=, $3, $11
-; NO-SIMD128-NEXT: i32.const $push50=, 1
-; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop50
-; NO-SIMD128-NEXT: i32.const $push49=, 65534
-; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop49
+; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop51
+; NO-SIMD128-NEXT: i32.const $push50=, 65534
+; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop50
+; NO-SIMD128-NEXT: i32.const $push49=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop49
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop13
+; NO-SIMD128-NEXT: i32.add $push14=, $5, $13
; NO-SIMD128-NEXT: i32.const $push48=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop48
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop33
-; NO-SIMD128-NEXT: i32.add $push34=, $2, $10
-; NO-SIMD128-NEXT: i32.const $push47=, 1
-; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop47
-; NO-SIMD128-NEXT: i32.const $push46=, 65534
-; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop46
+; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop48
+; NO-SIMD128-NEXT: i32.const $push47=, 65534
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $pop47
+; NO-SIMD128-NEXT: i32.const $push46=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push17=, $pop16, $pop46
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop17
+; NO-SIMD128-NEXT: i32.add $push18=, $4, $12
; NO-SIMD128-NEXT: i32.const $push45=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop45
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop37
-; NO-SIMD128-NEXT: i32.add $push38=, $1, $9
-; NO-SIMD128-NEXT: i32.const $push44=, 1
-; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop44
-; NO-SIMD128-NEXT: i32.const $push43=, 65534
-; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $pop43
+; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop45
+; NO-SIMD128-NEXT: i32.const $push44=, 65534
+; NO-SIMD128-NEXT: i32.and $push20=, $pop19, $pop44
+; NO-SIMD128-NEXT: i32.const $push43=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop43
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop21
+; NO-SIMD128-NEXT: i32.add $push22=, $3, $11
; NO-SIMD128-NEXT: i32.const $push42=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop42
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop41
+; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop42
+; NO-SIMD128-NEXT: i32.const $push41=, 65534
+; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop41
+; NO-SIMD128-NEXT: i32.const $push40=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop40
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop25
+; NO-SIMD128-NEXT: i32.add $push26=, $2, $10
+; NO-SIMD128-NEXT: i32.const $push39=, 1
+; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop39
+; NO-SIMD128-NEXT: i32.const $push38=, 65534
+; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop38
+; NO-SIMD128-NEXT: i32.const $push37=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop37
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop29
+; NO-SIMD128-NEXT: i32.add $push30=, $1, $9
+; NO-SIMD128-NEXT: i32.const $push36=, 1
+; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop36
+; NO-SIMD128-NEXT: i32.const $push35=, 65534
+; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop35
+; NO-SIMD128-NEXT: i32.const $push34=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop34
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop33
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: avgr_u_v8i16:
@@ -7078,73 +5814,65 @@ define <8 x i16> @avgr_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $pop0, $pop1
; NO-SIMD128-FAST-NEXT: i32.const $push3=, 65534
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop2, $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop63
+; NO-SIMD128-FAST-NEXT: i32.const $push55=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop55
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop5
; NO-SIMD128-FAST-NEXT: i32.add $push6=, $2, $10
-; NO-SIMD128-FAST-NEXT: i32.const $push62=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop62
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop61
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop60
+; NO-SIMD128-FAST-NEXT: i32.const $push54=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop54
+; NO-SIMD128-FAST-NEXT: i32.const $push53=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push52=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop52
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.add $push10=, $3, $11
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop59
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop58
-; NO-SIMD128-FAST-NEXT: i32.const $push57=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop57
-; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $pop16, $pop56
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $pop17, $pop55
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push19=, $pop18, $pop54
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop19
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $pop20, $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $pop52
; NO-SIMD128-FAST-NEXT: i32.const $push51=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push23=, $pop22, $pop51
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.add $push26=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop49
+; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop51
+; NO-SIMD128-FAST-NEXT: i32.const $push50=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop50
+; NO-SIMD128-FAST-NEXT: i32.const $push49=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop49
+; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.add $push14=, $4, $12
; NO-SIMD128-FAST-NEXT: i32.const $push48=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop48
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop25), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $pop32, $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $pop46
+; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop48
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.add $push18=, $5, $13
; NO-SIMD128-FAST-NEXT: i32.const $push45=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop31), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.add $push38=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop43
+; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.add $push22=, $6, $14
; NO-SIMD128-FAST-NEXT: i32.const $push42=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop37), $pop41
+; NO-SIMD128-FAST-NEXT: i32.add $push23=, $pop22, $pop42
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push24=, $pop23, $pop41
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop40
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop25
+; NO-SIMD128-FAST-NEXT: i32.add $push26=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop37
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop29
+; NO-SIMD128-FAST-NEXT: i32.add $push30=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push31=, $pop30, $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $pop31, $pop35
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop33
; NO-SIMD128-FAST-NEXT: return
%a = add nuw <8 x i16> %x, %y
%b = add nuw <8 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -7176,78 +5904,70 @@ define <8 x i16> @avgr_u_v8i16_wrap(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: avgr_u_v8i16_wrap:
; NO-SIMD128: .functype avgr_u_v8i16_wrap (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.add $push2=, $8, $16
-; NO-SIMD128-NEXT: i32.const $push3=, 1
-; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 65534
-; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5
-; NO-SIMD128-NEXT: i32.const $push63=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop63
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $pop7
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.add $push10=, $7, $15
-; NO-SIMD128-NEXT: i32.const $push62=, 1
-; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop62
-; NO-SIMD128-NEXT: i32.const $push61=, 65534
-; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop61
-; NO-SIMD128-NEXT: i32.const $push60=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop60
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop13
-; NO-SIMD128-NEXT: i32.const $push14=, 10
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.add $push16=, $6, $14
-; NO-SIMD128-NEXT: i32.const $push59=, 1
-; NO-SIMD128-NEXT: i32.add $push17=, $pop16, $pop59
-; NO-SIMD128-NEXT: i32.const $push58=, 65534
-; NO-SIMD128-NEXT: i32.and $push18=, $pop17, $pop58
-; NO-SIMD128-NEXT: i32.const $push57=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push19=, $pop18, $pop57
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop19
-; NO-SIMD128-NEXT: i32.add $push20=, $5, $13
-; NO-SIMD128-NEXT: i32.const $push56=, 1
-; NO-SIMD128-NEXT: i32.add $push21=, $pop20, $pop56
-; NO-SIMD128-NEXT: i32.const $push55=, 65534
-; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $pop55
+; NO-SIMD128-NEXT: i32.add $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.const $push1=, 1
+; NO-SIMD128-NEXT: i32.add $push2=, $pop0, $pop1
+; NO-SIMD128-NEXT: i32.const $push3=, 65534
+; NO-SIMD128-NEXT: i32.and $push4=, $pop2, $pop3
+; NO-SIMD128-NEXT: i32.const $push55=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push5=, $pop4, $pop55
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $7, $15
; NO-SIMD128-NEXT: i32.const $push54=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push23=, $pop22, $pop54
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop23
-; NO-SIMD128-NEXT: i32.const $push24=, 6
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.add $push26=, $4, $12
-; NO-SIMD128-NEXT: i32.const $push53=, 1
-; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop53
-; NO-SIMD128-NEXT: i32.const $push52=, 65534
-; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop52
+; NO-SIMD128-NEXT: i32.add $push7=, $pop6, $pop54
+; NO-SIMD128-NEXT: i32.const $push53=, 65534
+; NO-SIMD128-NEXT: i32.and $push8=, $pop7, $pop53
+; NO-SIMD128-NEXT: i32.const $push52=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop52
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop9
+; NO-SIMD128-NEXT: i32.add $push10=, $6, $14
; NO-SIMD128-NEXT: i32.const $push51=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop51
-; NO-SIMD128-NEXT: i32.store16 0($pop25), $pop29
-; NO-SIMD128-NEXT: i32.add $push30=, $3, $11
-; NO-SIMD128-NEXT: i32.const $push50=, 1
-; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop50
-; NO-SIMD128-NEXT: i32.const $push49=, 65534
-; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop49
+; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop51
+; NO-SIMD128-NEXT: i32.const $push50=, 65534
+; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop50
+; NO-SIMD128-NEXT: i32.const $push49=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop49
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop13
+; NO-SIMD128-NEXT: i32.add $push14=, $5, $13
; NO-SIMD128-NEXT: i32.const $push48=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop48
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop33
-; NO-SIMD128-NEXT: i32.add $push34=, $2, $10
-; NO-SIMD128-NEXT: i32.const $push47=, 1
-; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop47
-; NO-SIMD128-NEXT: i32.const $push46=, 65534
-; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop46
+; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop48
+; NO-SIMD128-NEXT: i32.const $push47=, 65534
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $pop47
+; NO-SIMD128-NEXT: i32.const $push46=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push17=, $pop16, $pop46
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop17
+; NO-SIMD128-NEXT: i32.add $push18=, $4, $12
; NO-SIMD128-NEXT: i32.const $push45=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop45
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop37
-; NO-SIMD128-NEXT: i32.add $push38=, $1, $9
-; NO-SIMD128-NEXT: i32.const $push44=, 1
-; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop44
-; NO-SIMD128-NEXT: i32.const $push43=, 65534
-; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $pop43
+; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop45
+; NO-SIMD128-NEXT: i32.const $push44=, 65534
+; NO-SIMD128-NEXT: i32.and $push20=, $pop19, $pop44
+; NO-SIMD128-NEXT: i32.const $push43=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop43
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop21
+; NO-SIMD128-NEXT: i32.add $push22=, $3, $11
; NO-SIMD128-NEXT: i32.const $push42=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop42
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop41
+; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop42
+; NO-SIMD128-NEXT: i32.const $push41=, 65534
+; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop41
+; NO-SIMD128-NEXT: i32.const $push40=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop40
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop25
+; NO-SIMD128-NEXT: i32.add $push26=, $2, $10
+; NO-SIMD128-NEXT: i32.const $push39=, 1
+; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop39
+; NO-SIMD128-NEXT: i32.const $push38=, 65534
+; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop38
+; NO-SIMD128-NEXT: i32.const $push37=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop37
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop29
+; NO-SIMD128-NEXT: i32.add $push30=, $1, $9
+; NO-SIMD128-NEXT: i32.const $push36=, 1
+; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop36
+; NO-SIMD128-NEXT: i32.const $push35=, 65534
+; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop35
+; NO-SIMD128-NEXT: i32.const $push34=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop34
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop33
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: avgr_u_v8i16_wrap:
@@ -7258,73 +5978,65 @@ define <8 x i16> @avgr_u_v8i16_wrap(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $pop0, $pop1
; NO-SIMD128-FAST-NEXT: i32.const $push3=, 65534
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop2, $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop63
+; NO-SIMD128-FAST-NEXT: i32.const $push55=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop55
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop5
; NO-SIMD128-FAST-NEXT: i32.add $push6=, $2, $10
-; NO-SIMD128-FAST-NEXT: i32.const $push62=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop62
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop61
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop60
+; NO-SIMD128-FAST-NEXT: i32.const $push54=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop54
+; NO-SIMD128-FAST-NEXT: i32.const $push53=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push52=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop52
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.add $push10=, $3, $11
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop59
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop58
-; NO-SIMD128-FAST-NEXT: i32.const $push57=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop57
-; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $pop16, $pop56
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $pop17, $pop55
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push19=, $pop18, $pop54
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop19
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $pop20, $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $pop52
; NO-SIMD128-FAST-NEXT: i32.const $push51=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push23=, $pop22, $pop51
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.add $push26=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop49
+; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop51
+; NO-SIMD128-FAST-NEXT: i32.const $push50=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop50
+; NO-SIMD128-FAST-NEXT: i32.const $push49=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop49
+; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.add $push14=, $4, $12
; NO-SIMD128-FAST-NEXT: i32.const $push48=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop48
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop25), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $pop32, $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $pop46
+; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop48
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.add $push18=, $5, $13
; NO-SIMD128-FAST-NEXT: i32.const $push45=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop31), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.add $push38=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop43
+; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.add $push22=, $6, $14
; NO-SIMD128-FAST-NEXT: i32.const $push42=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop37), $pop41
+; NO-SIMD128-FAST-NEXT: i32.add $push23=, $pop22, $pop42
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push24=, $pop23, $pop41
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop40
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop25
+; NO-SIMD128-FAST-NEXT: i32.add $push26=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop37
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop29
+; NO-SIMD128-FAST-NEXT: i32.add $push30=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push31=, $pop30, $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $pop31, $pop35
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop33
; NO-SIMD128-FAST-NEXT: return
%a = add <8 x i16> %x, %y
%b = add <8 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -7348,70 +6060,62 @@ define <8 x i16> @abs_v8i16(<8 x i16> %x) {
; NO-SIMD128-LABEL: abs_v8i16:
; NO-SIMD128: .functype abs_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 14
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend16_s $push0=, $8
; NO-SIMD128-NEXT: i32.const $push1=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push55=, $pop0, $pop1
-; NO-SIMD128-NEXT: local.tee $push54=, $9=, $pop55
-; NO-SIMD128-NEXT: i32.xor $push2=, $8, $pop54
+; NO-SIMD128-NEXT: i32.shr_s $push47=, $pop0, $pop1
+; NO-SIMD128-NEXT: local.tee $push46=, $9=, $pop47
+; NO-SIMD128-NEXT: i32.xor $push2=, $8, $pop46
; NO-SIMD128-NEXT: i32.sub $push3=, $pop2, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.extend16_s $push6=, $7
-; NO-SIMD128-NEXT: i32.const $push53=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push52=, $pop6, $pop53
-; NO-SIMD128-NEXT: local.tee $push51=, $8=, $pop52
-; NO-SIMD128-NEXT: i32.xor $push7=, $7, $pop51
-; NO-SIMD128-NEXT: i32.sub $push8=, $pop7, $8
-; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push14=, 10
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.extend16_s $push11=, $6
-; NO-SIMD128-NEXT: i32.const $push50=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push49=, $pop11, $pop50
-; NO-SIMD128-NEXT: local.tee $push48=, $8=, $pop49
-; NO-SIMD128-NEXT: i32.xor $push12=, $6, $pop48
-; NO-SIMD128-NEXT: i32.sub $push13=, $pop12, $8
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
-; NO-SIMD128-NEXT: i32.extend16_s $push16=, $5
-; NO-SIMD128-NEXT: i32.const $push47=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push46=, $pop16, $pop47
-; NO-SIMD128-NEXT: local.tee $push45=, $8=, $pop46
-; NO-SIMD128-NEXT: i32.xor $push17=, $5, $pop45
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
+; NO-SIMD128-NEXT: i32.extend16_s $push4=, $7
+; NO-SIMD128-NEXT: i32.const $push45=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push44=, $pop4, $pop45
+; NO-SIMD128-NEXT: local.tee $push43=, $8=, $pop44
+; NO-SIMD128-NEXT: i32.xor $push5=, $7, $pop43
+; NO-SIMD128-NEXT: i32.sub $push6=, $pop5, $8
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-NEXT: i32.extend16_s $push7=, $6
+; NO-SIMD128-NEXT: i32.const $push42=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push41=, $pop7, $pop42
+; NO-SIMD128-NEXT: local.tee $push40=, $8=, $pop41
+; NO-SIMD128-NEXT: i32.xor $push8=, $6, $pop40
+; NO-SIMD128-NEXT: i32.sub $push9=, $pop8, $8
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop9
+; NO-SIMD128-NEXT: i32.extend16_s $push10=, $5
+; NO-SIMD128-NEXT: i32.const $push39=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push38=, $pop10, $pop39
+; NO-SIMD128-NEXT: local.tee $push37=, $8=, $pop38
+; NO-SIMD128-NEXT: i32.xor $push11=, $5, $pop37
+; NO-SIMD128-NEXT: i32.sub $push12=, $pop11, $8
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop12
+; NO-SIMD128-NEXT: i32.extend16_s $push13=, $4
+; NO-SIMD128-NEXT: i32.const $push36=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push35=, $pop13, $pop36
+; NO-SIMD128-NEXT: local.tee $push34=, $8=, $pop35
+; NO-SIMD128-NEXT: i32.xor $push14=, $4, $pop34
+; NO-SIMD128-NEXT: i32.sub $push15=, $pop14, $8
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-NEXT: i32.extend16_s $push16=, $3
+; NO-SIMD128-NEXT: i32.const $push33=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push32=, $pop16, $pop33
+; NO-SIMD128-NEXT: local.tee $push31=, $8=, $pop32
+; NO-SIMD128-NEXT: i32.xor $push17=, $3, $pop31
; NO-SIMD128-NEXT: i32.sub $push18=, $pop17, $8
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop18
-; NO-SIMD128-NEXT: i32.const $push22=, 6
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.extend16_s $push19=, $4
-; NO-SIMD128-NEXT: i32.const $push44=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push43=, $pop19, $pop44
-; NO-SIMD128-NEXT: local.tee $push42=, $8=, $pop43
-; NO-SIMD128-NEXT: i32.xor $push20=, $4, $pop42
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop18
+; NO-SIMD128-NEXT: i32.extend16_s $push19=, $2
+; NO-SIMD128-NEXT: i32.const $push30=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push29=, $pop19, $pop30
+; NO-SIMD128-NEXT: local.tee $push28=, $8=, $pop29
+; NO-SIMD128-NEXT: i32.xor $push20=, $2, $pop28
; NO-SIMD128-NEXT: i32.sub $push21=, $pop20, $8
-; NO-SIMD128-NEXT: i32.store16 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.extend16_s $push24=, $3
-; NO-SIMD128-NEXT: i32.const $push41=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push40=, $pop24, $pop41
-; NO-SIMD128-NEXT: local.tee $push39=, $8=, $pop40
-; NO-SIMD128-NEXT: i32.xor $push25=, $3, $pop39
-; NO-SIMD128-NEXT: i32.sub $push26=, $pop25, $8
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop26
-; NO-SIMD128-NEXT: i32.extend16_s $push27=, $2
-; NO-SIMD128-NEXT: i32.const $push38=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push37=, $pop27, $pop38
-; NO-SIMD128-NEXT: local.tee $push36=, $8=, $pop37
-; NO-SIMD128-NEXT: i32.xor $push28=, $2, $pop36
-; NO-SIMD128-NEXT: i32.sub $push29=, $pop28, $8
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop29
-; NO-SIMD128-NEXT: i32.extend16_s $push30=, $1
-; NO-SIMD128-NEXT: i32.const $push35=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push34=, $pop30, $pop35
-; NO-SIMD128-NEXT: local.tee $push33=, $8=, $pop34
-; NO-SIMD128-NEXT: i32.xor $push31=, $1, $pop33
-; NO-SIMD128-NEXT: i32.sub $push32=, $pop31, $8
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop32
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop21
+; NO-SIMD128-NEXT: i32.extend16_s $push22=, $1
+; NO-SIMD128-NEXT: i32.const $push27=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push26=, $pop22, $pop27
+; NO-SIMD128-NEXT: local.tee $push25=, $8=, $pop26
+; NO-SIMD128-NEXT: i32.xor $push23=, $1, $pop25
+; NO-SIMD128-NEXT: i32.sub $push24=, $pop23, $8
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop24
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: abs_v8i16:
@@ -7419,68 +6123,60 @@ define <8 x i16> @abs_v8i16(<8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push0=, $1
; NO-SIMD128-FAST-NEXT: i32.const $push1=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push55=, $pop0, $pop1
-; NO-SIMD128-FAST-NEXT: local.tee $push54=, $9=, $pop55
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop54
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push47=, $pop0, $pop1
+; NO-SIMD128-FAST-NEXT: local.tee $push46=, $9=, $pop47
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop46
; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop2, $9
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push4=, $2
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push52=, $pop4, $pop53
-; NO-SIMD128-FAST-NEXT: local.tee $push51=, $1=, $pop52
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop51
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push44=, $pop4, $pop45
+; NO-SIMD128-FAST-NEXT: local.tee $push43=, $1=, $pop44
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop43
; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push7=, $3
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push49=, $pop7, $pop50
-; NO-SIMD128-FAST-NEXT: local.tee $push48=, $2=, $pop49
-; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $3, $pop48
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push41=, $pop7, $pop42
+; NO-SIMD128-FAST-NEXT: local.tee $push40=, $2=, $pop41
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $3, $pop40
; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $pop8, $2
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push10=, $4
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push46=, $pop10, $pop47
-; NO-SIMD128-FAST-NEXT: local.tee $push45=, $3=, $pop46
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $4, $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push38=, $pop10, $pop39
+; NO-SIMD128-FAST-NEXT: local.tee $push37=, $3=, $pop38
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $4, $pop37
; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $pop11, $3
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push15=, $5
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push43=, $pop15, $pop44
-; NO-SIMD128-FAST-NEXT: local.tee $push42=, $4=, $pop43
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $5, $pop42
-; NO-SIMD128-FAST-NEXT: i32.sub $push17=, $pop16, $4
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push18=, $6
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push40=, $pop18, $pop41
-; NO-SIMD128-FAST-NEXT: local.tee $push39=, $5=, $pop40
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $6, $pop39
-; NO-SIMD128-FAST-NEXT: i32.sub $push20=, $pop19, $5
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $7
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push37=, $pop23, $pop38
-; NO-SIMD128-FAST-NEXT: local.tee $push36=, $6=, $pop37
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $7, $pop36
-; NO-SIMD128-FAST-NEXT: i32.sub $push25=, $pop24, $6
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $8
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push34=, $pop28, $pop35
-; NO-SIMD128-FAST-NEXT: local.tee $push33=, $0=, $pop34
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $8, $pop33
-; NO-SIMD128-FAST-NEXT: i32.sub $push30=, $pop29, $0
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $5
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push35=, $pop13, $pop36
+; NO-SIMD128-FAST-NEXT: local.tee $push34=, $4=, $pop35
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $5, $pop34
+; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $pop14, $4
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push16=, $6
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push32=, $pop16, $pop33
+; NO-SIMD128-FAST-NEXT: local.tee $push31=, $5=, $pop32
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $6, $pop31
+; NO-SIMD128-FAST-NEXT: i32.sub $push18=, $pop17, $5
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push19=, $7
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push29=, $pop19, $pop30
+; NO-SIMD128-FAST-NEXT: local.tee $push28=, $6=, $pop29
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $7, $pop28
+; NO-SIMD128-FAST-NEXT: i32.sub $push21=, $pop20, $6
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push22=, $8
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push26=, $pop22, $pop27
+; NO-SIMD128-FAST-NEXT: local.tee $push25=, $7=, $pop26
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $8, $pop25
+; NO-SIMD128-FAST-NEXT: i32.sub $push24=, $pop23, $7
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24
; NO-SIMD128-FAST-NEXT: return
%a = sub <8 x i16> zeroinitializer, %x
%b = icmp slt <8 x i16> %x, zeroinitializer
@@ -7505,37 +6201,29 @@ define <8 x i16> @neg_v8i16(<8 x i16> %x) {
; NO-SIMD128: .functype neg_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 0
-; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $5
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push23=, 0
-; NO-SIMD128-NEXT: i32.sub $push2=, $pop23, $3
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push22=, 0
-; NO-SIMD128-NEXT: i32.sub $push3=, $pop22, $2
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push21=, 0
-; NO-SIMD128-NEXT: i32.sub $push4=, $pop21, $1
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 14
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.const $push20=, 0
-; NO-SIMD128-NEXT: i32.sub $push5=, $pop20, $8
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.const $push19=, 0
-; NO-SIMD128-NEXT: i32.sub $push8=, $pop19, $7
-; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 10
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.const $push18=, 0
-; NO-SIMD128-NEXT: i32.sub $push11=, $pop18, $6
-; NO-SIMD128-NEXT: i32.store16 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 6
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push17=, 0
-; NO-SIMD128-NEXT: i32.sub $push14=, $pop17, $4
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $8
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push15=, 0
+; NO-SIMD128-NEXT: i32.sub $push2=, $pop15, $7
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push14=, 0
+; NO-SIMD128-NEXT: i32.sub $push3=, $pop14, $6
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push13=, 0
+; NO-SIMD128-NEXT: i32.sub $push4=, $pop13, $5
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push12=, 0
+; NO-SIMD128-NEXT: i32.sub $push5=, $pop12, $4
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push11=, 0
+; NO-SIMD128-NEXT: i32.sub $push6=, $pop11, $3
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push10=, 0
+; NO-SIMD128-NEXT: i32.sub $push7=, $pop10, $2
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push9=, 0
+; NO-SIMD128-NEXT: i32.sub $push8=, $pop9, $1
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: neg_v8i16:
@@ -7544,35 +6232,27 @@ define <8 x i16> @neg_v8i16(<8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 0
; NO-SIMD128-FAST-NEXT: i32.sub $push1=, $pop0, $1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop23, $2
+; NO-SIMD128-FAST-NEXT: i32.const $push15=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop15, $2
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop22, $3
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop14, $3
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop21, $4
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $pop20, $5
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push10=, $pop19, $6
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push13=, $pop18, $7
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push16=, $pop17, $8
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push13=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $pop13, $4
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push12=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $pop12, $5
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push11=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop11, $6
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push10=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $pop10, $7
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push9=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $pop9, $8
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%a = sub <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>,
%x
@@ -7596,64 +6276,48 @@ define <8 x i16> @shl_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128: .functype shl_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push18=, $9, $pop0
-; NO-SIMD128-NEXT: local.tee $push17=, $9=, $pop18
-; NO-SIMD128-NEXT: i32.shl $push1=, $5, $pop17
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop1
-; NO-SIMD128-NEXT: i32.shl $push2=, $3, $9
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-NEXT: i32.shl $push3=, $2, $9
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop3
-; NO-SIMD128-NEXT: i32.shl $push4=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 14
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.shl $push5=, $8, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.shl $push8=, $7, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 10
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.shl $push11=, $6, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 6
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.shl $push14=, $4, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT: i32.and $push10=, $9, $pop0
+; NO-SIMD128-NEXT: local.tee $push9=, $9=, $pop10
+; NO-SIMD128-NEXT: i32.shl $push1=, $8, $pop9
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop1
+; NO-SIMD128-NEXT: i32.shl $push2=, $7, $9
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop2
+; NO-SIMD128-NEXT: i32.shl $push3=, $6, $9
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop3
+; NO-SIMD128-NEXT: i32.shl $push4=, $5, $9
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-NEXT: i32.shl $push5=, $4, $9
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop5
+; NO-SIMD128-NEXT: i32.shl $push6=, $3, $9
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
+; NO-SIMD128-NEXT: i32.shl $push7=, $2, $9
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop7
+; NO-SIMD128-NEXT: i32.shl $push8=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_v8i16:
; NO-SIMD128-FAST: .functype shl_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $9, $pop0
-; NO-SIMD128-FAST-NEXT: local.tee $push17=, $9=, $pop18
-; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $2, $pop17
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $9, $pop0
+; NO-SIMD128-FAST-NEXT: local.tee $push9=, $9=, $pop10
+; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $2, $pop9
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $1, $9
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2
; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $9
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $5, $9
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $6, $9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $7, $9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $9
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $5, $9
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $6, $9
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $7, $9
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $8, $9
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <8 x i16> undef, i16 %x, i32 0
%s = shufflevector <8 x i16> %t, <8 x i16> undef,
@@ -7681,37 +6345,29 @@ define <8 x i16> @shl_const_v8i16(<8 x i16> %v) {
; NO-SIMD128: .functype shl_const_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 5
-; NO-SIMD128-NEXT: i32.shl $push1=, $5, $pop0
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push23=, 5
-; NO-SIMD128-NEXT: i32.shl $push2=, $3, $pop23
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push22=, 5
-; NO-SIMD128-NEXT: i32.shl $push3=, $2, $pop22
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push21=, 5
-; NO-SIMD128-NEXT: i32.shl $push4=, $1, $pop21
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 14
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.const $push20=, 5
-; NO-SIMD128-NEXT: i32.shl $push5=, $8, $pop20
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.const $push19=, 5
-; NO-SIMD128-NEXT: i32.shl $push8=, $7, $pop19
-; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 10
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.shl $push11=, $6, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 6
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push17=, 5
-; NO-SIMD128-NEXT: i32.shl $push14=, $4, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT: i32.shl $push1=, $8, $pop0
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push15=, 5
+; NO-SIMD128-NEXT: i32.shl $push2=, $7, $pop15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push14=, 5
+; NO-SIMD128-NEXT: i32.shl $push3=, $6, $pop14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push13=, 5
+; NO-SIMD128-NEXT: i32.shl $push4=, $5, $pop13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push12=, 5
+; NO-SIMD128-NEXT: i32.shl $push5=, $4, $pop12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push11=, 5
+; NO-SIMD128-NEXT: i32.shl $push6=, $3, $pop11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push10=, 5
+; NO-SIMD128-NEXT: i32.shl $push7=, $2, $pop10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push9=, 5
+; NO-SIMD128-NEXT: i32.shl $push8=, $1, $pop9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_const_v8i16:
@@ -7720,35 +6376,27 @@ define <8 x i16> @shl_const_v8i16(<8 x i16> %v) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 5
; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop23
+; NO-SIMD128-FAST-NEXT: i32.const $push15=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop15
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop22
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop14
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $5, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $6, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $7, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $pop17
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push13=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push12=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $5, $pop12
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push11=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push10=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $7, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push9=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $8, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%a = shl <8 x i16> %v,
<i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
@@ -7866,45 +6514,37 @@ define <8 x i16> @shl_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128: .functype shl_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $13, $pop0
-; NO-SIMD128-NEXT: i32.shl $push2=, $5, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push31=, 65535
-; NO-SIMD128-NEXT: i32.and $push3=, $11, $pop31
-; NO-SIMD128-NEXT: i32.shl $push4=, $3, $pop3
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push30=, 65535
-; NO-SIMD128-NEXT: i32.and $push5=, $10, $pop30
-; NO-SIMD128-NEXT: i32.shl $push6=, $2, $pop5
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push29=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $9, $pop29
-; NO-SIMD128-NEXT: i32.shl $push8=, $1, $pop7
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push28=, 65535
-; NO-SIMD128-NEXT: i32.and $push9=, $16, $pop28
-; NO-SIMD128-NEXT: i32.shl $push10=, $8, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push27=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $15, $pop27
-; NO-SIMD128-NEXT: i32.shl $push14=, $7, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push19=, 10
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push26=, 65535
-; NO-SIMD128-NEXT: i32.and $push17=, $14, $pop26
-; NO-SIMD128-NEXT: i32.shl $push18=, $6, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push23=, 6
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.const $push25=, 65535
-; NO-SIMD128-NEXT: i32.and $push21=, $12, $pop25
-; NO-SIMD128-NEXT: i32.shl $push22=, $4, $pop21
-; NO-SIMD128-NEXT: i32.store16 0($pop24), $pop22
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop0
+; NO-SIMD128-NEXT: i32.shl $push2=, $8, $pop1
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push23=, 65535
+; NO-SIMD128-NEXT: i32.and $push3=, $15, $pop23
+; NO-SIMD128-NEXT: i32.shl $push4=, $7, $pop3
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push22=, 65535
+; NO-SIMD128-NEXT: i32.and $push5=, $14, $pop22
+; NO-SIMD128-NEXT: i32.shl $push6=, $6, $pop5
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push21=, 65535
+; NO-SIMD128-NEXT: i32.and $push7=, $13, $pop21
+; NO-SIMD128-NEXT: i32.shl $push8=, $5, $pop7
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push20=, 65535
+; NO-SIMD128-NEXT: i32.and $push9=, $12, $pop20
+; NO-SIMD128-NEXT: i32.shl $push10=, $4, $pop9
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-NEXT: i32.and $push11=, $11, $pop19
+; NO-SIMD128-NEXT: i32.shl $push12=, $3, $pop11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push18=, 65535
+; NO-SIMD128-NEXT: i32.and $push13=, $10, $pop18
+; NO-SIMD128-NEXT: i32.shl $push14=, $2, $pop13
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push17=, 65535
+; NO-SIMD128-NEXT: i32.and $push15=, $9, $pop17
+; NO-SIMD128-NEXT: i32.shl $push16=, $1, $pop15
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_vec_v8i16:
@@ -7914,42 +6554,34 @@ define <8 x i16> @shl_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop0
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $1, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push3=, $10, $pop31
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $10, $pop23
; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $2, $pop3
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $11, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $11, $pop22
; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $3, $pop5
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $12, $pop29
-; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $4, $pop9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push28=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $13, $pop28
-; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $5, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $14, $pop27
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $6, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $15, $pop26
-; NO-SIMD128-FAST-NEXT: i32.shl $push20=, $7, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push25=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $16, $pop25
-; NO-SIMD128-FAST-NEXT: i32.shl $push24=, $8, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $12, $pop21
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $4, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $13, $pop20
+; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $5, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $14, $pop19
+; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $15, $pop18
+; NO-SIMD128-FAST-NEXT: i32.shl $push14=, $7, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push15=, $16, $pop17
+; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%a = shl <8 x i16> %v, %x
ret <8 x i16> %a
@@ -7971,41 +6603,33 @@ define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-LABEL: shr_s_v8i16:
; NO-SIMD128: .functype shr_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend16_s $push1=, $5
+; NO-SIMD128-NEXT: i32.extend16_s $push1=, $8
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push26=, $9, $pop0
-; NO-SIMD128-NEXT: local.tee $push25=, $9=, $pop26
-; NO-SIMD128-NEXT: i32.shr_s $push2=, $pop1, $pop25
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend16_s $push3=, $3
+; NO-SIMD128-NEXT: i32.and $push18=, $9, $pop0
+; NO-SIMD128-NEXT: local.tee $push17=, $9=, $pop18
+; NO-SIMD128-NEXT: i32.shr_s $push2=, $pop1, $pop17
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.extend16_s $push3=, $7
; NO-SIMD128-NEXT: i32.shr_s $push4=, $pop3, $9
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop4
-; NO-SIMD128-NEXT: i32.extend16_s $push5=, $2
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop4
+; NO-SIMD128-NEXT: i32.extend16_s $push5=, $6
; NO-SIMD128-NEXT: i32.shr_s $push6=, $pop5, $9
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-NEXT: i32.extend16_s $push7=, $1
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-NEXT: i32.extend16_s $push7=, $5
; NO-SIMD128-NEXT: i32.shr_s $push8=, $pop7, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.extend16_s $push9=, $8
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop8
+; NO-SIMD128-NEXT: i32.extend16_s $push9=, $4
; NO-SIMD128-NEXT: i32.shr_s $push10=, $pop9, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.extend16_s $push13=, $7
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop10
+; NO-SIMD128-NEXT: i32.extend16_s $push11=, $3
+; NO-SIMD128-NEXT: i32.shr_s $push12=, $pop11, $9
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop12
+; NO-SIMD128-NEXT: i32.extend16_s $push13=, $2
; NO-SIMD128-NEXT: i32.shr_s $push14=, $pop13, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push19=, 10
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.extend16_s $push17=, $6
-; NO-SIMD128-NEXT: i32.shr_s $push18=, $pop17, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push23=, 6
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.extend16_s $push21=, $4
-; NO-SIMD128-NEXT: i32.shr_s $push22=, $pop21, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop24), $pop22
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop14
+; NO-SIMD128-NEXT: i32.extend16_s $push15=, $1
+; NO-SIMD128-NEXT: i32.shr_s $push16=, $pop15, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_v8i16:
@@ -8013,9 +6637,9 @@ define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push1=, $1
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push26=, $9, $pop0
-; NO-SIMD128-FAST-NEXT: local.tee $push25=, $1=, $pop26
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $pop1, $pop25
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $9, $pop0
+; NO-SIMD128-FAST-NEXT: local.tee $push17=, $1=, $pop18
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $pop1, $pop17
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push3=, $2
; NO-SIMD128-FAST-NEXT: i32.shr_s $push4=, $pop3, $1
@@ -8023,29 +6647,21 @@ define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push5=, $3
; NO-SIMD128-FAST-NEXT: i32.shr_s $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push9=, $4
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push7=, $4
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push8=, $pop7, $1
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push9=, $5
; NO-SIMD128-FAST-NEXT: i32.shr_s $push10=, $pop9, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $5
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $6
; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $pop11, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push15=, $6
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push14=, $pop13, $1
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push15=, $8
; NO-SIMD128-FAST-NEXT: i32.shr_s $push16=, $pop15, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push19=, $7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push20=, $pop19, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $8
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push24=, $pop23, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop24
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <8 x i16> undef, i16 %x, i32 0
%s = shufflevector <8 x i16> %t, <8 x i16> undef,
@@ -8164,54 +6780,46 @@ define <8 x i16> @shr_s_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128-LABEL: shr_s_vec_v8i16:
; NO-SIMD128: .functype shr_s_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend16_s $push2=, $5
+; NO-SIMD128-NEXT: i32.extend16_s $push2=, $8
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $13, $pop0
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop0
; NO-SIMD128-NEXT: i32.shr_s $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
-; NO-SIMD128-NEXT: i32.extend16_s $push5=, $3
-; NO-SIMD128-NEXT: i32.const $push39=, 65535
-; NO-SIMD128-NEXT: i32.and $push4=, $11, $pop39
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
+; NO-SIMD128-NEXT: i32.extend16_s $push5=, $7
+; NO-SIMD128-NEXT: i32.const $push31=, 65535
+; NO-SIMD128-NEXT: i32.and $push4=, $15, $pop31
; NO-SIMD128-NEXT: i32.shr_s $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-NEXT: i32.extend16_s $push8=, $2
-; NO-SIMD128-NEXT: i32.const $push38=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $10, $pop38
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-NEXT: i32.extend16_s $push8=, $6
+; NO-SIMD128-NEXT: i32.const $push30=, 65535
+; NO-SIMD128-NEXT: i32.and $push7=, $14, $pop30
; NO-SIMD128-NEXT: i32.shr_s $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop9
-; NO-SIMD128-NEXT: i32.extend16_s $push11=, $1
-; NO-SIMD128-NEXT: i32.const $push37=, 65535
-; NO-SIMD128-NEXT: i32.and $push10=, $9, $pop37
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop9
+; NO-SIMD128-NEXT: i32.extend16_s $push11=, $5
+; NO-SIMD128-NEXT: i32.const $push29=, 65535
+; NO-SIMD128-NEXT: i32.and $push10=, $13, $pop29
; NO-SIMD128-NEXT: i32.shr_s $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 14
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.extend16_s $push14=, $8
-; NO-SIMD128-NEXT: i32.const $push36=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $16, $pop36
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop12
+; NO-SIMD128-NEXT: i32.extend16_s $push14=, $4
+; NO-SIMD128-NEXT: i32.const $push28=, 65535
+; NO-SIMD128-NEXT: i32.and $push13=, $12, $pop28
; NO-SIMD128-NEXT: i32.shr_s $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push21=, 12
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.extend16_s $push19=, $7
-; NO-SIMD128-NEXT: i32.const $push35=, 65535
-; NO-SIMD128-NEXT: i32.and $push18=, $15, $pop35
-; NO-SIMD128-NEXT: i32.shr_s $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push26=, 10
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-NEXT: i32.extend16_s $push24=, $6
-; NO-SIMD128-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-NEXT: i32.and $push23=, $14, $pop34
-; NO-SIMD128-NEXT: i32.shr_s $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-NEXT: i32.extend16_s $push29=, $4
-; NO-SIMD128-NEXT: i32.const $push33=, 65535
-; NO-SIMD128-NEXT: i32.and $push28=, $12, $pop33
-; NO-SIMD128-NEXT: i32.shr_s $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-NEXT: i32.extend16_s $push17=, $3
+; NO-SIMD128-NEXT: i32.const $push27=, 65535
+; NO-SIMD128-NEXT: i32.and $push16=, $11, $pop27
+; NO-SIMD128-NEXT: i32.shr_s $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop18
+; NO-SIMD128-NEXT: i32.extend16_s $push20=, $2
+; NO-SIMD128-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-NEXT: i32.and $push19=, $10, $pop26
+; NO-SIMD128-NEXT: i32.shr_s $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop21
+; NO-SIMD128-NEXT: i32.extend16_s $push23=, $1
+; NO-SIMD128-NEXT: i32.const $push25=, 65535
+; NO-SIMD128-NEXT: i32.and $push22=, $9, $pop25
+; NO-SIMD128-NEXT: i32.shr_s $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop24
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_vec_v8i16:
@@ -8223,48 +6831,40 @@ define <8 x i16> @shr_s_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: i32.shr_s $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push5=, $2
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop39
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop31
; NO-SIMD128-FAST-NEXT: i32.shr_s $push6=, $pop5, $pop4
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push8=, $3
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop30
; NO-SIMD128-FAST-NEXT: i32.shr_s $push9=, $pop8, $pop7
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $4
-; NO-SIMD128-FAST-NEXT: i32.const $push37=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $12, $pop37
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push14=, $pop13, $pop12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop14
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push16=, $5
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $13, $pop36
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push21=, $6
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $14, $pop35
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push22=, $pop21, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop19), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push26=, $7
-; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $pop34
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push27=, $pop26, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop24), $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push28=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push31=, $8
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $16, $pop33
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push32=, $pop31, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop29), $pop32
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $4
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop29
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push14=, $5
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $13, $pop28
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push17=, $6
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $14, $pop27
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push20=, $7
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $15, $pop26
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $8
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $16, $pop25
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24
; NO-SIMD128-FAST-NEXT: return
%a = ashr <8 x i16> %v, %x
ret <8 x i16> %a
@@ -8287,48 +6887,40 @@ define <8 x i16> @shr_u_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128: .functype shr_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $5, $pop0
-; NO-SIMD128-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-NEXT: i32.and $push33=, $9, $pop34
-; NO-SIMD128-NEXT: local.tee $push32=, $9=, $pop33
-; NO-SIMD128-NEXT: i32.shr_u $push2=, $pop1, $pop32
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push31=, 65535
-; NO-SIMD128-NEXT: i32.and $push3=, $3, $pop31
+; NO-SIMD128-NEXT: i32.and $push1=, $8, $pop0
+; NO-SIMD128-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-NEXT: i32.and $push25=, $9, $pop26
+; NO-SIMD128-NEXT: local.tee $push24=, $9=, $pop25
+; NO-SIMD128-NEXT: i32.shr_u $push2=, $pop1, $pop24
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push23=, 65535
+; NO-SIMD128-NEXT: i32.and $push3=, $7, $pop23
; NO-SIMD128-NEXT: i32.shr_u $push4=, $pop3, $9
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push30=, 65535
-; NO-SIMD128-NEXT: i32.and $push5=, $2, $pop30
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push22=, 65535
+; NO-SIMD128-NEXT: i32.and $push5=, $6, $pop22
; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $9
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push29=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $1, $pop29
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push21=, 65535
+; NO-SIMD128-NEXT: i32.and $push7=, $5, $pop21
; NO-SIMD128-NEXT: i32.shr_u $push8=, $pop7, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push28=, 65535
-; NO-SIMD128-NEXT: i32.and $push9=, $8, $pop28
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push20=, 65535
+; NO-SIMD128-NEXT: i32.and $push9=, $4, $pop20
; NO-SIMD128-NEXT: i32.shr_u $push10=, $pop9, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push27=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $7, $pop27
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-NEXT: i32.and $push11=, $3, $pop19
+; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $9
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push18=, 65535
+; NO-SIMD128-NEXT: i32.and $push13=, $2, $pop18
; NO-SIMD128-NEXT: i32.shr_u $push14=, $pop13, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push19=, 10
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push26=, 65535
-; NO-SIMD128-NEXT: i32.and $push17=, $6, $pop26
-; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push23=, 6
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.const $push25=, 65535
-; NO-SIMD128-NEXT: i32.and $push21=, $4, $pop25
-; NO-SIMD128-NEXT: i32.shr_u $push22=, $pop21, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop24), $pop22
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push17=, 65535
+; NO-SIMD128-NEXT: i32.and $push15=, $1, $pop17
+; NO-SIMD128-NEXT: i32.shr_u $push16=, $pop15, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_v8i16:
@@ -8336,47 +6928,39 @@ define <8 x i16> @shr_u_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push1=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push33=, $9, $pop34
-; NO-SIMD128-FAST-NEXT: local.tee $push32=, $1=, $pop33
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $pop1, $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $9, $pop26
+; NO-SIMD128-FAST-NEXT: local.tee $push24=, $1=, $pop25
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $pop1, $pop24
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push3=, $2, $pop31
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $2, $pop23
; NO-SIMD128-FAST-NEXT: i32.shr_u $push4=, $pop3, $1
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $3, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $3, $pop22
; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $4, $pop29
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $4, $pop21
; NO-SIMD128-FAST-NEXT: i32.shr_u $push8=, $pop7, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push28=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $5, $pop28
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $5, $pop20
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push10=, $pop9, $1
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $6, $pop19
; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push15=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $6, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $7, $pop18
; NO-SIMD128-FAST-NEXT: i32.shr_u $push14=, $pop13, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push17=, $7, $pop26
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push18=, $pop17, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop20), $pop18
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push25=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $8, $pop25
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push22=, $pop21, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop24), $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push15=, $8, $pop17
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push16=, $pop15, $1
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <8 x i16> undef, i16 %x, i32 0
%s = shufflevector <8 x i16> %t, <8 x i16> undef,
@@ -8496,61 +7080,53 @@ define <8 x i16> @shr_u_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128: .functype shr_u_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push2=, $5, $pop0
-; NO-SIMD128-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $13, $pop47
-; NO-SIMD128-NEXT: i32.shr_u $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-NEXT: i32.and $push5=, $3, $pop46
-; NO-SIMD128-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-NEXT: i32.and $push4=, $11, $pop45
-; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-NEXT: i32.and $push8=, $2, $pop44
-; NO-SIMD128-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $10, $pop43
-; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-NEXT: i32.and $push11=, $1, $pop42
-; NO-SIMD128-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-NEXT: i32.and $push10=, $9, $pop41
-; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 14
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push40=, 65535
-; NO-SIMD128-NEXT: i32.and $push14=, $8, $pop40
+; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0
; NO-SIMD128-NEXT: i32.const $push39=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $16, $pop39
-; NO-SIMD128-NEXT: i32.shr_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push21=, 12
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop39
+; NO-SIMD128-NEXT: i32.shr_u $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
; NO-SIMD128-NEXT: i32.const $push38=, 65535
-; NO-SIMD128-NEXT: i32.and $push19=, $7, $pop38
+; NO-SIMD128-NEXT: i32.and $push5=, $7, $pop38
; NO-SIMD128-NEXT: i32.const $push37=, 65535
-; NO-SIMD128-NEXT: i32.and $push18=, $15, $pop37
-; NO-SIMD128-NEXT: i32.shr_u $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push26=, 10
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT: i32.and $push4=, $15, $pop37
+; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop6
; NO-SIMD128-NEXT: i32.const $push36=, 65535
-; NO-SIMD128-NEXT: i32.and $push24=, $6, $pop36
+; NO-SIMD128-NEXT: i32.and $push8=, $6, $pop36
; NO-SIMD128-NEXT: i32.const $push35=, 65535
-; NO-SIMD128-NEXT: i32.and $push23=, $14, $pop35
-; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT: i32.and $push7=, $14, $pop35
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop9
; NO-SIMD128-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-NEXT: i32.and $push29=, $4, $pop34
+; NO-SIMD128-NEXT: i32.and $push11=, $5, $pop34
; NO-SIMD128-NEXT: i32.const $push33=, 65535
-; NO-SIMD128-NEXT: i32.and $push28=, $12, $pop33
-; NO-SIMD128-NEXT: i32.shr_u $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-NEXT: i32.and $push10=, $13, $pop33
+; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push32=, 65535
+; NO-SIMD128-NEXT: i32.and $push14=, $4, $pop32
+; NO-SIMD128-NEXT: i32.const $push31=, 65535
+; NO-SIMD128-NEXT: i32.and $push13=, $12, $pop31
+; NO-SIMD128-NEXT: i32.shr_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push30=, 65535
+; NO-SIMD128-NEXT: i32.and $push17=, $3, $pop30
+; NO-SIMD128-NEXT: i32.const $push29=, 65535
+; NO-SIMD128-NEXT: i32.and $push16=, $11, $pop29
+; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push28=, 65535
+; NO-SIMD128-NEXT: i32.and $push20=, $2, $pop28
+; NO-SIMD128-NEXT: i32.const $push27=, 65535
+; NO-SIMD128-NEXT: i32.and $push19=, $10, $pop27
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop21
+; NO-SIMD128-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-NEXT: i32.and $push23=, $1, $pop26
+; NO-SIMD128-NEXT: i32.const $push25=, 65535
+; NO-SIMD128-NEXT: i32.and $push22=, $9, $pop25
+; NO-SIMD128-NEXT: i32.shr_u $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop24
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_vec_v8i16:
@@ -8558,60 +7134,52 @@ define <8 x i16> @shr_u_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop39
; NO-SIMD128-FAST-NEXT: i32.shr_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop45
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop43
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop41
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $5, $pop40
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $13, $pop39
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $6, $pop38
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop38
; NO-SIMD128-FAST-NEXT: i32.const $push37=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $14, $pop37
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop37
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push36=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $7, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop36
; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $15, $pop35
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop35
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop34
; NO-SIMD128-FAST-NEXT: i32.const $push33=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $16, $pop33
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop33
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push32=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $5, $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $13, $pop31
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $6, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $14, $pop29
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $15, $pop27
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $8, $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $16, $pop25
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24
; NO-SIMD128-FAST-NEXT: return
%a = lshr <8 x i16> %v, %x
ret <8 x i16> %a
@@ -8633,30 +7201,22 @@ define <8 x i16> @and_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: and_v8i16:
; NO-SIMD128: .functype and_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.and $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.and $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.and $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.and $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.and $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.and $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.and $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.and $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.and $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.and $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.and $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.and $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.and $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.and $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.and $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.and $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: and_v8i16:
@@ -8668,24 +7228,16 @@ define <8 x i16> @and_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = and <8 x i16> %x, %y
ret <8 x i16> %a
@@ -8707,30 +7259,22 @@ define <8 x i16> @or_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: or_v8i16:
; NO-SIMD128: .functype or_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.or $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.or $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.or $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.or $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.or $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.or $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.or $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.or $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.or $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.or $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.or $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.or $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.or $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.or $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.or $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.or $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: or_v8i16:
@@ -8742,24 +7286,16 @@ define <8 x i16> @or_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.or $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.or $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.or $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.or $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.or $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.or $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.or $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.or $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.or $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.or $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.or $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = or <8 x i16> %x, %y
ret <8 x i16> %a
@@ -8781,30 +7317,22 @@ define <8 x i16> @xor_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: xor_v8i16:
; NO-SIMD128: .functype xor_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.xor $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.xor $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.xor $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.xor $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.xor $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.xor $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.xor $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.xor $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.xor $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.xor $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.xor $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.xor $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.xor $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.xor $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: xor_v8i16:
@@ -8816,24 +7344,16 @@ define <8 x i16> @xor_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = xor <8 x i16> %x, %y
ret <8 x i16> %a
@@ -8856,37 +7376,29 @@ define <8 x i16> @not_v8i16(<8 x i16> %x) {
; NO-SIMD128: .functype not_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $5, $pop0
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push23=, -1
-; NO-SIMD128-NEXT: i32.xor $push2=, $3, $pop23
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push22=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $2, $pop22
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push21=, -1
-; NO-SIMD128-NEXT: i32.xor $push4=, $1, $pop21
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 14
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.const $push20=, -1
-; NO-SIMD128-NEXT: i32.xor $push5=, $8, $pop20
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.const $push19=, -1
-; NO-SIMD128-NEXT: i32.xor $push8=, $7, $pop19
-; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 10
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.const $push18=, -1
-; NO-SIMD128-NEXT: i32.xor $push11=, $6, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 6
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push17=, -1
-; NO-SIMD128-NEXT: i32.xor $push14=, $4, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT: i32.xor $push1=, $8, $pop0
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push15=, -1
+; NO-SIMD128-NEXT: i32.xor $push2=, $7, $pop15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push14=, -1
+; NO-SIMD128-NEXT: i32.xor $push3=, $6, $pop14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push13=, -1
+; NO-SIMD128-NEXT: i32.xor $push4=, $5, $pop13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push12=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $4, $pop12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push11=, -1
+; NO-SIMD128-NEXT: i32.xor $push6=, $3, $pop11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push10=, -1
+; NO-SIMD128-NEXT: i32.xor $push7=, $2, $pop10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push9=, -1
+; NO-SIMD128-NEXT: i32.xor $push8=, $1, $pop9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: not_v8i16:
@@ -8895,35 +7407,27 @@ define <8 x i16> @not_v8i16(<8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, -1
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop23
+; NO-SIMD128-FAST-NEXT: i32.const $push15=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop15
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop22
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop14
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $4, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $5, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $6, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $7, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $8, $pop17
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push13=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $4, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push12=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $5, $pop12
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push11=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push10=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $7, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push9=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $8, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%a = xor <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1,
i16 -1, i16 -1, i16 -1, i16 -1>
@@ -8948,45 +7452,37 @@ define <8 x i16> @andnot_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128: .functype andnot_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $13, $pop0
-; NO-SIMD128-NEXT: i32.and $push2=, $5, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push31=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $11, $pop31
-; NO-SIMD128-NEXT: i32.and $push4=, $3, $pop3
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push30=, -1
-; NO-SIMD128-NEXT: i32.xor $push5=, $10, $pop30
-; NO-SIMD128-NEXT: i32.and $push6=, $2, $pop5
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push29=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $9, $pop29
-; NO-SIMD128-NEXT: i32.and $push8=, $1, $pop7
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push28=, -1
-; NO-SIMD128-NEXT: i32.xor $push9=, $16, $pop28
-; NO-SIMD128-NEXT: i32.and $push10=, $8, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push27=, -1
-; NO-SIMD128-NEXT: i32.xor $push13=, $15, $pop27
-; NO-SIMD128-NEXT: i32.and $push14=, $7, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push19=, 10
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push26=, -1
-; NO-SIMD128-NEXT: i32.xor $push17=, $14, $pop26
-; NO-SIMD128-NEXT: i32.and $push18=, $6, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push23=, 6
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.const $push25=, -1
-; NO-SIMD128-NEXT: i32.xor $push21=, $12, $pop25
-; NO-SIMD128-NEXT: i32.and $push22=, $4, $pop21
-; NO-SIMD128-NEXT: i32.store16 0($pop24), $pop22
+; NO-SIMD128-NEXT: i32.xor $push1=, $16, $pop0
+; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop1
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push23=, -1
+; NO-SIMD128-NEXT: i32.xor $push3=, $15, $pop23
+; NO-SIMD128-NEXT: i32.and $push4=, $7, $pop3
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push22=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $14, $pop22
+; NO-SIMD128-NEXT: i32.and $push6=, $6, $pop5
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push21=, -1
+; NO-SIMD128-NEXT: i32.xor $push7=, $13, $pop21
+; NO-SIMD128-NEXT: i32.and $push8=, $5, $pop7
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push20=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $12, $pop20
+; NO-SIMD128-NEXT: i32.and $push10=, $4, $pop9
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push19=, -1
+; NO-SIMD128-NEXT: i32.xor $push11=, $11, $pop19
+; NO-SIMD128-NEXT: i32.and $push12=, $3, $pop11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push18=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $10, $pop18
+; NO-SIMD128-NEXT: i32.and $push14=, $2, $pop13
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push17=, -1
+; NO-SIMD128-NEXT: i32.xor $push15=, $9, $pop17
+; NO-SIMD128-NEXT: i32.and $push16=, $1, $pop15
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: andnot_v8i16:
@@ -8996,42 +7492,34 @@ define <8 x i16> @andnot_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $9, $pop0
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $10, $pop31
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $10, $pop23
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $2, $pop3
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $11, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $11, $pop22
; NO-SIMD128-FAST-NEXT: i32.and $push6=, $3, $pop5
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $12, $pop29
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $4, $pop9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push28=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $13, $pop28
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $5, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $14, $pop27
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $6, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $15, $pop26
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push25=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $16, $pop25
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $8, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $12, $pop21
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $4, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $13, $pop20
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $5, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $14, $pop19
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $15, $pop18
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $7, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $16, $pop17
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $8, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%inv_y = xor <8 x i16> %y,
<i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
@@ -9058,62 +7546,54 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-LABEL: bitselect_v8i16:
; NO-SIMD128: .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.and $push0=, $16, $8
; NO-SIMD128-NEXT: i32.const $push1=, -1
; NO-SIMD128-NEXT: i32.xor $push2=, $8, $pop1
; NO-SIMD128-NEXT: i32.and $push3=, $24, $pop2
; NO-SIMD128-NEXT: i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 12
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.and $push7=, $15, $7
-; NO-SIMD128-NEXT: i32.const $push47=, -1
-; NO-SIMD128-NEXT: i32.xor $push8=, $7, $pop47
-; NO-SIMD128-NEXT: i32.and $push9=, $23, $pop8
-; NO-SIMD128-NEXT: i32.or $push10=, $pop7, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 10
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.and $push13=, $14, $6
-; NO-SIMD128-NEXT: i32.const $push46=, -1
-; NO-SIMD128-NEXT: i32.xor $push14=, $6, $pop46
-; NO-SIMD128-NEXT: i32.and $push15=, $22, $pop14
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop4
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $7
+; NO-SIMD128-NEXT: i32.const $push39=, -1
+; NO-SIMD128-NEXT: i32.xor $push6=, $7, $pop39
+; NO-SIMD128-NEXT: i32.and $push7=, $23, $pop6
+; NO-SIMD128-NEXT: i32.or $push8=, $pop5, $pop7
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop8
+; NO-SIMD128-NEXT: i32.and $push9=, $14, $6
+; NO-SIMD128-NEXT: i32.const $push38=, -1
+; NO-SIMD128-NEXT: i32.xor $push10=, $6, $pop38
+; NO-SIMD128-NEXT: i32.and $push11=, $22, $pop10
+; NO-SIMD128-NEXT: i32.or $push12=, $pop9, $pop11
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-NEXT: i32.and $push13=, $13, $5
+; NO-SIMD128-NEXT: i32.const $push37=, -1
+; NO-SIMD128-NEXT: i32.xor $push14=, $5, $pop37
+; NO-SIMD128-NEXT: i32.and $push15=, $21, $pop14
; NO-SIMD128-NEXT: i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.and $push19=, $13, $5
-; NO-SIMD128-NEXT: i32.const $push45=, -1
-; NO-SIMD128-NEXT: i32.xor $push20=, $5, $pop45
-; NO-SIMD128-NEXT: i32.and $push21=, $21, $pop20
-; NO-SIMD128-NEXT: i32.or $push22=, $pop19, $pop21
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-NEXT: i32.const $push27=, 6
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.and $push23=, $12, $4
-; NO-SIMD128-NEXT: i32.const $push44=, -1
-; NO-SIMD128-NEXT: i32.xor $push24=, $4, $pop44
-; NO-SIMD128-NEXT: i32.and $push25=, $20, $pop24
-; NO-SIMD128-NEXT: i32.or $push26=, $pop23, $pop25
-; NO-SIMD128-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.and $push29=, $11, $3
-; NO-SIMD128-NEXT: i32.const $push43=, -1
-; NO-SIMD128-NEXT: i32.xor $push30=, $3, $pop43
-; NO-SIMD128-NEXT: i32.and $push31=, $19, $pop30
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop16
+; NO-SIMD128-NEXT: i32.and $push17=, $12, $4
+; NO-SIMD128-NEXT: i32.const $push36=, -1
+; NO-SIMD128-NEXT: i32.xor $push18=, $4, $pop36
+; NO-SIMD128-NEXT: i32.and $push19=, $20, $pop18
+; NO-SIMD128-NEXT: i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop20
+; NO-SIMD128-NEXT: i32.and $push21=, $11, $3
+; NO-SIMD128-NEXT: i32.const $push35=, -1
+; NO-SIMD128-NEXT: i32.xor $push22=, $3, $pop35
+; NO-SIMD128-NEXT: i32.and $push23=, $19, $pop22
+; NO-SIMD128-NEXT: i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop24
+; NO-SIMD128-NEXT: i32.and $push25=, $10, $2
+; NO-SIMD128-NEXT: i32.const $push34=, -1
+; NO-SIMD128-NEXT: i32.xor $push26=, $2, $pop34
+; NO-SIMD128-NEXT: i32.and $push27=, $18, $pop26
+; NO-SIMD128-NEXT: i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop28
+; NO-SIMD128-NEXT: i32.and $push29=, $9, $1
+; NO-SIMD128-NEXT: i32.const $push33=, -1
+; NO-SIMD128-NEXT: i32.xor $push30=, $1, $pop33
+; NO-SIMD128-NEXT: i32.and $push31=, $17, $pop30
; NO-SIMD128-NEXT: i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop32
-; NO-SIMD128-NEXT: i32.and $push33=, $10, $2
-; NO-SIMD128-NEXT: i32.const $push42=, -1
-; NO-SIMD128-NEXT: i32.xor $push34=, $2, $pop42
-; NO-SIMD128-NEXT: i32.and $push35=, $18, $pop34
-; NO-SIMD128-NEXT: i32.or $push36=, $pop33, $pop35
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop36
-; NO-SIMD128-NEXT: i32.and $push37=, $9, $1
-; NO-SIMD128-NEXT: i32.const $push41=, -1
-; NO-SIMD128-NEXT: i32.xor $push38=, $1, $pop41
-; NO-SIMD128-NEXT: i32.and $push39=, $17, $pop38
-; NO-SIMD128-NEXT: i32.or $push40=, $pop37, $pop39
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop40
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_v8i16:
@@ -9126,55 +7606,47 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop0, $pop3
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4
; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $2
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop39
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $18, $pop6
; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop5, $pop7
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8
; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $3
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop46
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop38
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $19, $pop10
; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop9, $pop11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $4
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop37
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $20, $pop14
; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $13, $5
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $5, $pop44
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $21, $pop20
-; NO-SIMD128-FAST-NEXT: i32.or $push22=, $pop19, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $14, $6
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $6, $pop43
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $22, $pop24
-; NO-SIMD128-FAST-NEXT: i32.or $push26=, $pop23, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $15, $7
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $7, $pop42
-; NO-SIMD128-FAST-NEXT: i32.and $push31=, $23, $pop30
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $13, $5
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $5, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $21, $pop18
+; NO-SIMD128-FAST-NEXT: i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $14, $6
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $6, $pop35
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $22, $pop22
+; NO-SIMD128-FAST-NEXT: i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $7
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $7, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $23, $pop26
+; NO-SIMD128-FAST-NEXT: i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $8
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $8, $pop33
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $24, $pop30
; NO-SIMD128-FAST-NEXT: i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $16, $8
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $8, $pop41
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $24, $pop36
-; NO-SIMD128-FAST-NEXT: i32.or $push38=, $pop35, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%masked_v1 = and <8 x i16> %v1, %c
%inv_mask = xor <8 x i16>
@@ -9203,46 +7675,38 @@ define <8 x i16> @bitselect_xor_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2
; NO-SIMD128-LABEL: bitselect_xor_v8i16:
; NO-SIMD128: .functype bitselect_xor_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push3=, 14
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
; NO-SIMD128-NEXT: i32.xor $push0=, $16, $24
; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $8
; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $24
-; NO-SIMD128-NEXT: i32.store16 0($pop4), $pop2
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.xor $push5=, $15, $23
-; NO-SIMD128-NEXT: i32.and $push6=, $pop5, $7
-; NO-SIMD128-NEXT: i32.xor $push7=, $pop6, $23
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push13=, 10
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.xor $push10=, $14, $22
-; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $6
-; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $22
-; NO-SIMD128-NEXT: i32.store16 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.xor $push15=, $13, $21
-; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $5
-; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $21
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 6
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.xor $push18=, $12, $20
-; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $4
-; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $20
-; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.xor $push23=, $11, $19
-; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $3
-; NO-SIMD128-NEXT: i32.xor $push25=, $pop24, $19
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop25
-; NO-SIMD128-NEXT: i32.xor $push26=, $10, $18
-; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $2
-; NO-SIMD128-NEXT: i32.xor $push28=, $pop27, $18
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop28
-; NO-SIMD128-NEXT: i32.xor $push29=, $9, $17
-; NO-SIMD128-NEXT: i32.and $push30=, $pop29, $1
-; NO-SIMD128-NEXT: i32.xor $push31=, $pop30, $17
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop31
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $15, $23
+; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $7
+; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $23
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $14, $22
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $6
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $22
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push9=, $13, $21
+; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $5
+; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $21
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop11
+; NO-SIMD128-NEXT: i32.xor $push12=, $12, $20
+; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $4
+; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $20
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop14
+; NO-SIMD128-NEXT: i32.xor $push15=, $11, $19
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $3
+; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $19
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop17
+; NO-SIMD128-NEXT: i32.xor $push18=, $10, $18
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $2
+; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $18
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop20
+; NO-SIMD128-NEXT: i32.xor $push21=, $9, $17
+; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $1
+; NO-SIMD128-NEXT: i32.xor $push23=, $pop22, $17
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop23
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_v8i16:
@@ -9260,34 +7724,26 @@ define <8 x i16> @bitselect_xor_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $19
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $12, $20
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $4
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $pop12, $20
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop10), $pop13
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $13, $21
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $5
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $21
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $14, $22
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $6
-; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $pop20, $22
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $15, $23
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $7
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop23), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $16, $24
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $pop29, $8
-; NO-SIMD128-FAST-NEXT: i32.xor $push31=, $pop30, $24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop31
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $12, $20
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $20
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $13, $21
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $pop12, $5
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $pop13, $21
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $14, $22
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $6
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $pop16, $22
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $15, $23
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $7
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $23
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $16, $24
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $8
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $pop22, $24
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop23
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <8 x i16> %v1, %v2
%and = and <8 x i16> %xor1, %c
@@ -9314,62 +7770,54 @@ define <8 x i16> @bitselect_xor_reversed_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x
; NO-SIMD128-LABEL: bitselect_xor_reversed_v8i16:
; NO-SIMD128: .functype bitselect_xor_reversed_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.xor $push2=, $16, $24
; NO-SIMD128-NEXT: i32.const $push0=, -1
; NO-SIMD128-NEXT: i32.xor $push1=, $8, $pop0
; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.xor $push4=, $pop3, $24
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 12
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.xor $push8=, $15, $23
-; NO-SIMD128-NEXT: i32.const $push47=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $7, $pop47
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.xor $push10=, $pop9, $23
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 10
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.xor $push14=, $14, $22
-; NO-SIMD128-NEXT: i32.const $push46=, -1
-; NO-SIMD128-NEXT: i32.xor $push13=, $6, $pop46
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop4
+; NO-SIMD128-NEXT: i32.xor $push6=, $15, $23
+; NO-SIMD128-NEXT: i32.const $push39=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $7, $pop39
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $23
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push10=, $14, $22
+; NO-SIMD128-NEXT: i32.const $push38=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $6, $pop38
+; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $22
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-NEXT: i32.xor $push14=, $13, $21
+; NO-SIMD128-NEXT: i32.const $push37=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $5, $pop37
; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $22
-; NO-SIMD128-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.xor $push20=, $13, $21
-; NO-SIMD128-NEXT: i32.const $push45=, -1
-; NO-SIMD128-NEXT: i32.xor $push19=, $5, $pop45
-; NO-SIMD128-NEXT: i32.and $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.xor $push22=, $pop21, $21
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-NEXT: i32.const $push27=, 6
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.xor $push24=, $12, $20
-; NO-SIMD128-NEXT: i32.const $push44=, -1
-; NO-SIMD128-NEXT: i32.xor $push23=, $4, $pop44
-; NO-SIMD128-NEXT: i32.and $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.xor $push26=, $pop25, $20
-; NO-SIMD128-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.xor $push30=, $11, $19
-; NO-SIMD128-NEXT: i32.const $push43=, -1
-; NO-SIMD128-NEXT: i32.xor $push29=, $3, $pop43
+; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $21
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop16
+; NO-SIMD128-NEXT: i32.xor $push18=, $12, $20
+; NO-SIMD128-NEXT: i32.const $push36=, -1
+; NO-SIMD128-NEXT: i32.xor $push17=, $4, $pop36
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $20
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop20
+; NO-SIMD128-NEXT: i32.xor $push22=, $11, $19
+; NO-SIMD128-NEXT: i32.const $push35=, -1
+; NO-SIMD128-NEXT: i32.xor $push21=, $3, $pop35
+; NO-SIMD128-NEXT: i32.and $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.xor $push24=, $pop23, $19
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop24
+; NO-SIMD128-NEXT: i32.xor $push26=, $10, $18
+; NO-SIMD128-NEXT: i32.const $push34=, -1
+; NO-SIMD128-NEXT: i32.xor $push25=, $2, $pop34
+; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.xor $push28=, $pop27, $18
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop28
+; NO-SIMD128-NEXT: i32.xor $push30=, $9, $17
+; NO-SIMD128-NEXT: i32.const $push33=, -1
+; NO-SIMD128-NEXT: i32.xor $push29=, $1, $pop33
; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $pop29
-; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $19
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop32
-; NO-SIMD128-NEXT: i32.xor $push34=, $10, $18
-; NO-SIMD128-NEXT: i32.const $push42=, -1
-; NO-SIMD128-NEXT: i32.xor $push33=, $2, $pop42
-; NO-SIMD128-NEXT: i32.and $push35=, $pop34, $pop33
-; NO-SIMD128-NEXT: i32.xor $push36=, $pop35, $18
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop36
-; NO-SIMD128-NEXT: i32.xor $push38=, $9, $17
-; NO-SIMD128-NEXT: i32.const $push41=, -1
-; NO-SIMD128-NEXT: i32.xor $push37=, $1, $pop41
-; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.xor $push40=, $pop39, $17
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop40
+; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $17
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_reversed_v8i16:
@@ -9382,55 +7830,47 @@ define <8 x i16> @bitselect_xor_reversed_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x
; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $pop3, $17
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4
; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $10, $18
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop39
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $18
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8
; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $11, $19
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop46
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop38
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $pop11, $19
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $12, $20
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop37
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $20
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $13, $21
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $5, $pop44
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $pop21, $21
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $14, $22
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $6, $pop43
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $22
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $15, $23
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $7, $pop42
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $13, $21
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $5, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $21
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $14, $22
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $6, $pop35
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $pop23, $22
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $15, $23
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $7, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.xor $push28=, $pop27, $23
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $16, $24
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $8, $pop33
; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $16, $24
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $8, $pop41
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $pop37, $24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $24
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <8 x i16> %v1, %v2
%notc = xor <8 x i16> %c, <i16 -1, i16 -1, i16 -1, i16 -1,
@@ -9458,46 +7898,38 @@ define <8 x i16> @extmul_low_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-LABEL: extmul_low_s_v8i16:
; NO-SIMD128: .functype extmul_low_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend8_s $push1=, $5
-; NO-SIMD128-NEXT: i32.extend8_s $push0=, $21
+; NO-SIMD128-NEXT: i32.extend8_s $push1=, $8
+; NO-SIMD128-NEXT: i32.extend8_s $push0=, $24
; NO-SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend8_s $push4=, $3
-; NO-SIMD128-NEXT: i32.extend8_s $push3=, $19
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.extend8_s $push4=, $7
+; NO-SIMD128-NEXT: i32.extend8_s $push3=, $23
; NO-SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
-; NO-SIMD128-NEXT: i32.extend8_s $push7=, $2
-; NO-SIMD128-NEXT: i32.extend8_s $push6=, $18
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop5
+; NO-SIMD128-NEXT: i32.extend8_s $push7=, $6
+; NO-SIMD128-NEXT: i32.extend8_s $push6=, $22
; NO-SIMD128-NEXT: i32.mul $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop8
-; NO-SIMD128-NEXT: i32.extend8_s $push10=, $1
-; NO-SIMD128-NEXT: i32.extend8_s $push9=, $17
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop8
+; NO-SIMD128-NEXT: i32.extend8_s $push10=, $5
+; NO-SIMD128-NEXT: i32.extend8_s $push9=, $21
; NO-SIMD128-NEXT: i32.mul $push11=, $pop10, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 14
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.extend8_s $push13=, $8
-; NO-SIMD128-NEXT: i32.extend8_s $push12=, $24
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop11
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $4
+; NO-SIMD128-NEXT: i32.extend8_s $push12=, $20
; NO-SIMD128-NEXT: i32.mul $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push20=, 12
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.extend8_s $push18=, $7
-; NO-SIMD128-NEXT: i32.extend8_s $push17=, $23
-; NO-SIMD128-NEXT: i32.mul $push19=, $pop18, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop21), $pop19
-; NO-SIMD128-NEXT: i32.const $push25=, 10
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.extend8_s $push23=, $6
-; NO-SIMD128-NEXT: i32.extend8_s $push22=, $22
-; NO-SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22
-; NO-SIMD128-NEXT: i32.store16 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.extend8_s $push28=, $4
-; NO-SIMD128-NEXT: i32.extend8_s $push27=, $20
-; NO-SIMD128-NEXT: i32.mul $push29=, $pop28, $pop27
-; NO-SIMD128-NEXT: i32.store16 0($pop31), $pop29
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop14
+; NO-SIMD128-NEXT: i32.extend8_s $push16=, $3
+; NO-SIMD128-NEXT: i32.extend8_s $push15=, $19
+; NO-SIMD128-NEXT: i32.mul $push17=, $pop16, $pop15
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop17
+; NO-SIMD128-NEXT: i32.extend8_s $push19=, $2
+; NO-SIMD128-NEXT: i32.extend8_s $push18=, $18
+; NO-SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop20
+; NO-SIMD128-NEXT: i32.extend8_s $push22=, $1
+; NO-SIMD128-NEXT: i32.extend8_s $push21=, $17
+; NO-SIMD128-NEXT: i32.mul $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop23
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_low_s_v8i16:
@@ -9515,34 +7947,26 @@ define <8 x i16> @extmul_low_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push6=, $19
; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $pop7, $pop6
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $4
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $20
-; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $pop12, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop10), $pop13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $5
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push14=, $21
-; NO-SIMD128-FAST-NEXT: i32.mul $push16=, $pop15, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $6
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $22
-; NO-SIMD128-FAST-NEXT: i32.mul $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $7
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push24=, $23
-; NO-SIMD128-FAST-NEXT: i32.mul $push26=, $pop25, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop23), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push30=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $24
-; NO-SIMD128-FAST-NEXT: i32.mul $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop31
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push10=, $4
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push9=, $20
+; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $5
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $21
+; NO-SIMD128-FAST-NEXT: i32.mul $push14=, $pop13, $pop12
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $6
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $22
+; NO-SIMD128-FAST-NEXT: i32.mul $push17=, $pop16, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $7
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $23
+; NO-SIMD128-FAST-NEXT: i32.mul $push20=, $pop19, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $24
+; NO-SIMD128-FAST-NEXT: i32.mul $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop23
; NO-SIMD128-FAST-NEXT: return
%low1 = shufflevector <16 x i8> %v1, <16 x i8> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -9572,46 +7996,38 @@ define <8 x i16> @extmul_high_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-LABEL: extmul_high_s_v8i16:
; NO-SIMD128: .functype extmul_high_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend8_s $push1=, $13
-; NO-SIMD128-NEXT: i32.extend8_s $push0=, $29
+; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16
+; NO-SIMD128-NEXT: i32.extend8_s $push0=, $32
; NO-SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend8_s $push4=, $11
-; NO-SIMD128-NEXT: i32.extend8_s $push3=, $27
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.extend8_s $push4=, $15
+; NO-SIMD128-NEXT: i32.extend8_s $push3=, $31
; NO-SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
-; NO-SIMD128-NEXT: i32.extend8_s $push7=, $10
-; NO-SIMD128-NEXT: i32.extend8_s $push6=, $26
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop5
+; NO-SIMD128-NEXT: i32.extend8_s $push7=, $14
+; NO-SIMD128-NEXT: i32.extend8_s $push6=, $30
; NO-SIMD128-NEXT: i32.mul $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop8
-; NO-SIMD128-NEXT: i32.extend8_s $push10=, $9
-; NO-SIMD128-NEXT: i32.extend8_s $push9=, $25
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop8
+; NO-SIMD128-NEXT: i32.extend8_s $push10=, $13
+; NO-SIMD128-NEXT: i32.extend8_s $push9=, $29
; NO-SIMD128-NEXT: i32.mul $push11=, $pop10, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 14
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.extend8_s $push13=, $16
-; NO-SIMD128-NEXT: i32.extend8_s $push12=, $32
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop11
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $12
+; NO-SIMD128-NEXT: i32.extend8_s $push12=, $28
; NO-SIMD128-NEXT: i32.mul $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push20=, 12
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.extend8_s $push18=, $15
-; NO-SIMD128-NEXT: i32.extend8_s $push17=, $31
-; NO-SIMD128-NEXT: i32.mul $push19=, $pop18, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop21), $pop19
-; NO-SIMD128-NEXT: i32.const $push25=, 10
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.extend8_s $push23=, $14
-; NO-SIMD128-NEXT: i32.extend8_s $push22=, $30
-; NO-SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22
-; NO-SIMD128-NEXT: i32.store16 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.extend8_s $push28=, $12
-; NO-SIMD128-NEXT: i32.extend8_s $push27=, $28
-; NO-SIMD128-NEXT: i32.mul $push29=, $pop28, $pop27
-; NO-SIMD128-NEXT: i32.store16 0($pop31), $pop29
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop14
+; NO-SIMD128-NEXT: i32.extend8_s $push16=, $11
+; NO-SIMD128-NEXT: i32.extend8_s $push15=, $27
+; NO-SIMD128-NEXT: i32.mul $push17=, $pop16, $pop15
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop17
+; NO-SIMD128-NEXT: i32.extend8_s $push19=, $10
+; NO-SIMD128-NEXT: i32.extend8_s $push18=, $26
+; NO-SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop20
+; NO-SIMD128-NEXT: i32.extend8_s $push22=, $9
+; NO-SIMD128-NEXT: i32.extend8_s $push21=, $25
+; NO-SIMD128-NEXT: i32.mul $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop23
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_high_s_v8i16:
@@ -9629,34 +8045,26 @@ define <8 x i16> @extmul_high_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push6=, $27
; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $pop7, $pop6
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $12
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $28
-; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $pop12, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop10), $pop13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push14=, $29
-; NO-SIMD128-FAST-NEXT: i32.mul $push16=, $pop15, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $14
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $30
-; NO-SIMD128-FAST-NEXT: i32.mul $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $15
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push24=, $31
-; NO-SIMD128-FAST-NEXT: i32.mul $push26=, $pop25, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop23), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push30=, $16
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $32
-; NO-SIMD128-FAST-NEXT: i32.mul $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop31
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push10=, $12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push9=, $28
+; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $13
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $29
+; NO-SIMD128-FAST-NEXT: i32.mul $push14=, $pop13, $pop12
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $30
+; NO-SIMD128-FAST-NEXT: i32.mul $push17=, $pop16, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $31
+; NO-SIMD128-FAST-NEXT: i32.mul $push20=, $pop19, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $16
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $32
+; NO-SIMD128-FAST-NEXT: i32.mul $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop23
; NO-SIMD128-FAST-NEXT: return
%high1 = shufflevector <16 x i8> %v1, <16 x i8> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -9687,61 +8095,53 @@ define <8 x i16> @extmul_low_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128: .functype extmul_low_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push2=, $5, $pop0
-; NO-SIMD128-NEXT: i32.const $push47=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $21, $pop47
-; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push46=, 255
-; NO-SIMD128-NEXT: i32.and $push5=, $3, $pop46
-; NO-SIMD128-NEXT: i32.const $push45=, 255
-; NO-SIMD128-NEXT: i32.and $push4=, $19, $pop45
-; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push44=, 255
-; NO-SIMD128-NEXT: i32.and $push8=, $2, $pop44
-; NO-SIMD128-NEXT: i32.const $push43=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $18, $pop43
-; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push42=, 255
-; NO-SIMD128-NEXT: i32.and $push11=, $1, $pop42
-; NO-SIMD128-NEXT: i32.const $push41=, 255
-; NO-SIMD128-NEXT: i32.and $push10=, $17, $pop41
-; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 14
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push40=, 255
-; NO-SIMD128-NEXT: i32.and $push14=, $8, $pop40
+; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0
; NO-SIMD128-NEXT: i32.const $push39=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $24, $pop39
-; NO-SIMD128-NEXT: i32.mul $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push21=, 12
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT: i32.and $push1=, $24, $pop39
+; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
; NO-SIMD128-NEXT: i32.const $push38=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $7, $pop38
+; NO-SIMD128-NEXT: i32.and $push5=, $7, $pop38
; NO-SIMD128-NEXT: i32.const $push37=, 255
-; NO-SIMD128-NEXT: i32.and $push18=, $23, $pop37
-; NO-SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push26=, 10
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT: i32.and $push4=, $23, $pop37
+; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop6
; NO-SIMD128-NEXT: i32.const $push36=, 255
-; NO-SIMD128-NEXT: i32.and $push24=, $6, $pop36
+; NO-SIMD128-NEXT: i32.and $push8=, $6, $pop36
; NO-SIMD128-NEXT: i32.const $push35=, 255
-; NO-SIMD128-NEXT: i32.and $push23=, $22, $pop35
-; NO-SIMD128-NEXT: i32.mul $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT: i32.and $push7=, $22, $pop35
+; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop9
; NO-SIMD128-NEXT: i32.const $push34=, 255
-; NO-SIMD128-NEXT: i32.and $push29=, $4, $pop34
+; NO-SIMD128-NEXT: i32.and $push11=, $5, $pop34
; NO-SIMD128-NEXT: i32.const $push33=, 255
-; NO-SIMD128-NEXT: i32.and $push28=, $20, $pop33
-; NO-SIMD128-NEXT: i32.mul $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-NEXT: i32.and $push10=, $21, $pop33
+; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push32=, 255
+; NO-SIMD128-NEXT: i32.and $push14=, $4, $pop32
+; NO-SIMD128-NEXT: i32.const $push31=, 255
+; NO-SIMD128-NEXT: i32.and $push13=, $20, $pop31
+; NO-SIMD128-NEXT: i32.mul $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push30=, 255
+; NO-SIMD128-NEXT: i32.and $push17=, $3, $pop30
+; NO-SIMD128-NEXT: i32.const $push29=, 255
+; NO-SIMD128-NEXT: i32.and $push16=, $19, $pop29
+; NO-SIMD128-NEXT: i32.mul $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push28=, 255
+; NO-SIMD128-NEXT: i32.and $push20=, $2, $pop28
+; NO-SIMD128-NEXT: i32.const $push27=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $18, $pop27
+; NO-SIMD128-NEXT: i32.mul $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop21
+; NO-SIMD128-NEXT: i32.const $push26=, 255
+; NO-SIMD128-NEXT: i32.and $push23=, $1, $pop26
+; NO-SIMD128-NEXT: i32.const $push25=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $17, $pop25
+; NO-SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop24
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_low_u_v8i16:
@@ -9749,60 +8149,52 @@ define <8 x i16> @extmul_low_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop39
; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop45
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop43
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop41
-; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $5, $pop40
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $21, $pop39
-; NO-SIMD128-FAST-NEXT: i32.mul $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
; NO-SIMD128-FAST-NEXT: i32.const $push38=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $6, $pop38
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop38
; NO-SIMD128-FAST-NEXT: i32.const $push37=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $22, $pop37
-; NO-SIMD128-FAST-NEXT: i32.mul $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop37
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push36=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $7, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop36
; NO-SIMD128-FAST-NEXT: i32.const $push35=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $23, $pop35
-; NO-SIMD128-FAST-NEXT: i32.mul $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop35
+; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.const $push34=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop34
; NO-SIMD128-FAST-NEXT: i32.const $push33=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $24, $pop33
-; NO-SIMD128-FAST-NEXT: i32.mul $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop33
+; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push32=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $5, $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $21, $pop31
+; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $6, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $22, $pop29
+; NO-SIMD128-FAST-NEXT: i32.mul $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $23, $pop27
+; NO-SIMD128-FAST-NEXT: i32.mul $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $8, $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $24, $pop25
+; NO-SIMD128-FAST-NEXT: i32.mul $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24
; NO-SIMD128-FAST-NEXT: return
%low1 = shufflevector <16 x i8> %v1, <16 x i8> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -9833,61 +8225,53 @@ define <8 x i16> @extmul_high_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128: .functype extmul_high_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push2=, $13, $pop0
-; NO-SIMD128-NEXT: i32.const $push47=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $29, $pop47
-; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push46=, 255
-; NO-SIMD128-NEXT: i32.and $push5=, $11, $pop46
-; NO-SIMD128-NEXT: i32.const $push45=, 255
-; NO-SIMD128-NEXT: i32.and $push4=, $27, $pop45
-; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push44=, 255
-; NO-SIMD128-NEXT: i32.and $push8=, $10, $pop44
-; NO-SIMD128-NEXT: i32.const $push43=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $26, $pop43
-; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push42=, 255
-; NO-SIMD128-NEXT: i32.and $push11=, $9, $pop42
-; NO-SIMD128-NEXT: i32.const $push41=, 255
-; NO-SIMD128-NEXT: i32.and $push10=, $25, $pop41
-; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 14
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push40=, 255
-; NO-SIMD128-NEXT: i32.and $push14=, $16, $pop40
+; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0
; NO-SIMD128-NEXT: i32.const $push39=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $32, $pop39
-; NO-SIMD128-NEXT: i32.mul $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push21=, 12
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop39
+; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
; NO-SIMD128-NEXT: i32.const $push38=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $15, $pop38
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $pop38
; NO-SIMD128-NEXT: i32.const $push37=, 255
-; NO-SIMD128-NEXT: i32.and $push18=, $31, $pop37
-; NO-SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push26=, 10
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT: i32.and $push4=, $31, $pop37
+; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop6
; NO-SIMD128-NEXT: i32.const $push36=, 255
-; NO-SIMD128-NEXT: i32.and $push24=, $14, $pop36
+; NO-SIMD128-NEXT: i32.and $push8=, $14, $pop36
; NO-SIMD128-NEXT: i32.const $push35=, 255
-; NO-SIMD128-NEXT: i32.and $push23=, $30, $pop35
-; NO-SIMD128-NEXT: i32.mul $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT: i32.and $push7=, $30, $pop35
+; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop9
; NO-SIMD128-NEXT: i32.const $push34=, 255
-; NO-SIMD128-NEXT: i32.and $push29=, $12, $pop34
+; NO-SIMD128-NEXT: i32.and $push11=, $13, $pop34
; NO-SIMD128-NEXT: i32.const $push33=, 255
-; NO-SIMD128-NEXT: i32.and $push28=, $28, $pop33
-; NO-SIMD128-NEXT: i32.mul $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-NEXT: i32.and $push10=, $29, $pop33
+; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push32=, 255
+; NO-SIMD128-NEXT: i32.and $push14=, $12, $pop32
+; NO-SIMD128-NEXT: i32.const $push31=, 255
+; NO-SIMD128-NEXT: i32.and $push13=, $28, $pop31
+; NO-SIMD128-NEXT: i32.mul $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push30=, 255
+; NO-SIMD128-NEXT: i32.and $push17=, $11, $pop30
+; NO-SIMD128-NEXT: i32.const $push29=, 255
+; NO-SIMD128-NEXT: i32.and $push16=, $27, $pop29
+; NO-SIMD128-NEXT: i32.mul $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push28=, 255
+; NO-SIMD128-NEXT: i32.and $push20=, $10, $pop28
+; NO-SIMD128-NEXT: i32.const $push27=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $26, $pop27
+; NO-SIMD128-NEXT: i32.mul $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop21
+; NO-SIMD128-NEXT: i32.const $push26=, 255
+; NO-SIMD128-NEXT: i32.and $push23=, $9, $pop26
+; NO-SIMD128-NEXT: i32.const $push25=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $25, $pop25
+; NO-SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop24
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_high_u_v8i16:
@@ -9895,60 +8279,52 @@ define <8 x i16> @extmul_high_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $9, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $25, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $25, $pop39
; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $26, $pop45
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $11, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $27, $pop43
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $12, $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $28, $pop41
-; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $13, $pop40
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $29, $pop39
-; NO-SIMD128-FAST-NEXT: i32.mul $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
; NO-SIMD128-FAST-NEXT: i32.const $push38=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $14, $pop38
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop38
; NO-SIMD128-FAST-NEXT: i32.const $push37=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $30, $pop37
-; NO-SIMD128-FAST-NEXT: i32.mul $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $26, $pop37
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push36=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $15, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $11, $pop36
; NO-SIMD128-FAST-NEXT: i32.const $push35=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $31, $pop35
-; NO-SIMD128-FAST-NEXT: i32.mul $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $27, $pop35
+; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.const $push34=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $12, $pop34
; NO-SIMD128-FAST-NEXT: i32.const $push33=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $32, $pop33
-; NO-SIMD128-FAST-NEXT: i32.mul $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $28, $pop33
+; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push32=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $13, $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $29, $pop31
+; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $14, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $30, $pop29
+; NO-SIMD128-FAST-NEXT: i32.mul $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $15, $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $31, $pop27
+; NO-SIMD128-FAST-NEXT: i32.mul $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $16, $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $32, $pop25
+; NO-SIMD128-FAST-NEXT: i32.mul $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24
; NO-SIMD128-FAST-NEXT: return
%high1 = shufflevector <16 x i8> %v1, <16 x i8> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -9979,16 +8355,14 @@ define <4 x i32> @add_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: add_v4i32:
; NO-SIMD128: .functype add_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.add $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.add $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.add $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.add $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.add $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.add $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.add $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.add $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: add_v4i32:
@@ -10000,10 +8374,8 @@ define <4 x i32> @add_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.add $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = add <4 x i32> %x, %y
ret <4 x i32> %a
@@ -10025,16 +8397,14 @@ define <4 x i32> @sub_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: sub_v4i32:
; NO-SIMD128: .functype sub_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.sub $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.sub $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.sub $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.sub $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.sub $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.sub $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.sub $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.sub $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: sub_v4i32:
@@ -10046,10 +8416,8 @@ define <4 x i32> @sub_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = sub <4 x i32> %x, %y
ret <4 x i32> %a
@@ -10071,16 +8439,14 @@ define <4 x i32> @mul_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: mul_v4i32:
; NO-SIMD128: .functype mul_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.mul $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.mul $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.mul $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.mul $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.mul $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.mul $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.mul $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.mul $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: mul_v4i32:
@@ -10092,10 +8458,8 @@ define <4 x i32> @mul_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.mul $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = mul <4 x i32> %x, %y
ret <4 x i32> %a
@@ -10117,20 +8481,18 @@ define <4 x i32> @min_s_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: min_s_v4i32:
; NO-SIMD128: .functype min_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.lt_s $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.select $push1=, $3, $7, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.lt_s $push2=, $2, $6
-; NO-SIMD128-NEXT: i32.select $push3=, $2, $6, $pop2
-; NO-SIMD128-NEXT: i32.store 4($0), $pop3
-; NO-SIMD128-NEXT: i32.lt_s $push4=, $1, $5
-; NO-SIMD128-NEXT: i32.select $push5=, $1, $5, $pop4
-; NO-SIMD128-NEXT: i32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.lt_s $push6=, $4, $8
-; NO-SIMD128-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: i32.lt_s $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.select $push1=, $4, $8, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
+; NO-SIMD128-NEXT: i32.lt_s $push2=, $3, $7
+; NO-SIMD128-NEXT: i32.select $push3=, $3, $7, $pop2
+; NO-SIMD128-NEXT: i32.store 8($0), $pop3
+; NO-SIMD128-NEXT: i32.lt_s $push4=, $2, $6
+; NO-SIMD128-NEXT: i32.select $push5=, $2, $6, $pop4
+; NO-SIMD128-NEXT: i32.store 4($0), $pop5
+; NO-SIMD128-NEXT: i32.lt_s $push6=, $1, $5
+; NO-SIMD128-NEXT: i32.select $push7=, $1, $5, $pop6
+; NO-SIMD128-NEXT: i32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_s_v4i32:
@@ -10145,11 +8507,9 @@ define <4 x i32> @min_s_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.lt_s $push4=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.select $push5=, $3, $7, $pop4
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: i32.lt_s $push6=, $4, $8
; NO-SIMD128-FAST-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = icmp slt <4 x i32> %x, %y
%a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
@@ -10172,20 +8532,18 @@ define <4 x i32> @min_u_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: min_u_v4i32:
; NO-SIMD128: .functype min_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.lt_u $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.select $push1=, $3, $7, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.lt_u $push2=, $2, $6
-; NO-SIMD128-NEXT: i32.select $push3=, $2, $6, $pop2
-; NO-SIMD128-NEXT: i32.store 4($0), $pop3
-; NO-SIMD128-NEXT: i32.lt_u $push4=, $1, $5
-; NO-SIMD128-NEXT: i32.select $push5=, $1, $5, $pop4
-; NO-SIMD128-NEXT: i32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.lt_u $push6=, $4, $8
-; NO-SIMD128-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: i32.lt_u $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.select $push1=, $4, $8, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
+; NO-SIMD128-NEXT: i32.lt_u $push2=, $3, $7
+; NO-SIMD128-NEXT: i32.select $push3=, $3, $7, $pop2
+; NO-SIMD128-NEXT: i32.store 8($0), $pop3
+; NO-SIMD128-NEXT: i32.lt_u $push4=, $2, $6
+; NO-SIMD128-NEXT: i32.select $push5=, $2, $6, $pop4
+; NO-SIMD128-NEXT: i32.store 4($0), $pop5
+; NO-SIMD128-NEXT: i32.lt_u $push6=, $1, $5
+; NO-SIMD128-NEXT: i32.select $push7=, $1, $5, $pop6
+; NO-SIMD128-NEXT: i32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_u_v4i32:
@@ -10200,11 +8558,9 @@ define <4 x i32> @min_u_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.lt_u $push4=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.select $push5=, $3, $7, $pop4
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: i32.lt_u $push6=, $4, $8
; NO-SIMD128-FAST-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = icmp ult <4 x i32> %x, %y
%a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
@@ -10227,20 +8583,18 @@ define <4 x i32> @max_s_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: max_s_v4i32:
; NO-SIMD128: .functype max_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.gt_s $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.select $push1=, $3, $7, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.gt_s $push2=, $2, $6
-; NO-SIMD128-NEXT: i32.select $push3=, $2, $6, $pop2
-; NO-SIMD128-NEXT: i32.store 4($0), $pop3
-; NO-SIMD128-NEXT: i32.gt_s $push4=, $1, $5
-; NO-SIMD128-NEXT: i32.select $push5=, $1, $5, $pop4
-; NO-SIMD128-NEXT: i32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.gt_s $push6=, $4, $8
-; NO-SIMD128-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: i32.gt_s $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.select $push1=, $4, $8, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
+; NO-SIMD128-NEXT: i32.gt_s $push2=, $3, $7
+; NO-SIMD128-NEXT: i32.select $push3=, $3, $7, $pop2
+; NO-SIMD128-NEXT: i32.store 8($0), $pop3
+; NO-SIMD128-NEXT: i32.gt_s $push4=, $2, $6
+; NO-SIMD128-NEXT: i32.select $push5=, $2, $6, $pop4
+; NO-SIMD128-NEXT: i32.store 4($0), $pop5
+; NO-SIMD128-NEXT: i32.gt_s $push6=, $1, $5
+; NO-SIMD128-NEXT: i32.select $push7=, $1, $5, $pop6
+; NO-SIMD128-NEXT: i32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_s_v4i32:
@@ -10255,11 +8609,9 @@ define <4 x i32> @max_s_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.gt_s $push4=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.select $push5=, $3, $7, $pop4
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: i32.gt_s $push6=, $4, $8
; NO-SIMD128-FAST-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = icmp sgt <4 x i32> %x, %y
%a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
@@ -10282,20 +8634,18 @@ define <4 x i32> @max_u_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: max_u_v4i32:
; NO-SIMD128: .functype max_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.gt_u $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.select $push1=, $3, $7, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.gt_u $push2=, $2, $6
-; NO-SIMD128-NEXT: i32.select $push3=, $2, $6, $pop2
-; NO-SIMD128-NEXT: i32.store 4($0), $pop3
-; NO-SIMD128-NEXT: i32.gt_u $push4=, $1, $5
-; NO-SIMD128-NEXT: i32.select $push5=, $1, $5, $pop4
-; NO-SIMD128-NEXT: i32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.gt_u $push6=, $4, $8
-; NO-SIMD128-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: i32.gt_u $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.select $push1=, $4, $8, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
+; NO-SIMD128-NEXT: i32.gt_u $push2=, $3, $7
+; NO-SIMD128-NEXT: i32.select $push3=, $3, $7, $pop2
+; NO-SIMD128-NEXT: i32.store 8($0), $pop3
+; NO-SIMD128-NEXT: i32.gt_u $push4=, $2, $6
+; NO-SIMD128-NEXT: i32.select $push5=, $2, $6, $pop4
+; NO-SIMD128-NEXT: i32.store 4($0), $pop5
+; NO-SIMD128-NEXT: i32.gt_u $push6=, $1, $5
+; NO-SIMD128-NEXT: i32.select $push7=, $1, $5, $pop6
+; NO-SIMD128-NEXT: i32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_u_v4i32:
@@ -10310,11 +8660,9 @@ define <4 x i32> @max_u_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.gt_u $push4=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.select $push5=, $3, $7, $pop4
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: i32.gt_u $push6=, $4, $8
; NO-SIMD128-FAST-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = icmp ugt <4 x i32> %x, %y
%a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
@@ -10337,63 +8685,59 @@ define <4 x i32> @abs_v4i32(<4 x i32> %x) {
; NO-SIMD128-LABEL: abs_v4i32:
; NO-SIMD128: .functype abs_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
; NO-SIMD128-NEXT: i32.const $push0=, 31
-; NO-SIMD128-NEXT: i32.shr_s $push21=, $4, $pop0
-; NO-SIMD128-NEXT: local.tee $push20=, $5=, $pop21
-; NO-SIMD128-NEXT: i32.xor $push1=, $4, $pop20
+; NO-SIMD128-NEXT: i32.shr_s $push19=, $4, $pop0
+; NO-SIMD128-NEXT: local.tee $push18=, $5=, $pop19
+; NO-SIMD128-NEXT: i32.xor $push1=, $4, $pop18
; NO-SIMD128-NEXT: i32.sub $push2=, $pop1, $5
-; NO-SIMD128-NEXT: i32.store 0($pop4), $pop2
-; NO-SIMD128-NEXT: i32.const $push19=, 31
-; NO-SIMD128-NEXT: i32.shr_s $push18=, $3, $pop19
-; NO-SIMD128-NEXT: local.tee $push17=, $4=, $pop18
-; NO-SIMD128-NEXT: i32.xor $push5=, $3, $pop17
+; NO-SIMD128-NEXT: i32.store 12($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push17=, 31
+; NO-SIMD128-NEXT: i32.shr_s $push16=, $3, $pop17
+; NO-SIMD128-NEXT: local.tee $push15=, $4=, $pop16
+; NO-SIMD128-NEXT: i32.xor $push3=, $3, $pop15
+; NO-SIMD128-NEXT: i32.sub $push4=, $pop3, $4
+; NO-SIMD128-NEXT: i32.store 8($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push14=, 31
+; NO-SIMD128-NEXT: i32.shr_s $push13=, $2, $pop14
+; NO-SIMD128-NEXT: local.tee $push12=, $4=, $pop13
+; NO-SIMD128-NEXT: i32.xor $push5=, $2, $pop12
; NO-SIMD128-NEXT: i32.sub $push6=, $pop5, $4
-; NO-SIMD128-NEXT: i32.store 8($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push16=, 31
-; NO-SIMD128-NEXT: i32.shr_s $push15=, $2, $pop16
-; NO-SIMD128-NEXT: local.tee $push14=, $4=, $pop15
-; NO-SIMD128-NEXT: i32.xor $push7=, $2, $pop14
+; NO-SIMD128-NEXT: i32.store 4($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push11=, 31
+; NO-SIMD128-NEXT: i32.shr_s $push10=, $1, $pop11
+; NO-SIMD128-NEXT: local.tee $push9=, $4=, $pop10
+; NO-SIMD128-NEXT: i32.xor $push7=, $1, $pop9
; NO-SIMD128-NEXT: i32.sub $push8=, $pop7, $4
-; NO-SIMD128-NEXT: i32.store 4($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push13=, 31
-; NO-SIMD128-NEXT: i32.shr_s $push12=, $1, $pop13
-; NO-SIMD128-NEXT: local.tee $push11=, $4=, $pop12
-; NO-SIMD128-NEXT: i32.xor $push9=, $1, $pop11
-; NO-SIMD128-NEXT: i32.sub $push10=, $pop9, $4
-; NO-SIMD128-NEXT: i32.store 0($0), $pop10
+; NO-SIMD128-NEXT: i32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: abs_v4i32:
; NO-SIMD128-FAST: .functype abs_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 31
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push21=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: local.tee $push20=, $5=, $pop21
-; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop20
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push19=, $1, $pop0
+; NO-SIMD128-FAST-NEXT: local.tee $push18=, $5=, $pop19
+; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop18
; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop1, $5
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 31
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push18=, $2, $pop19
-; NO-SIMD128-FAST-NEXT: local.tee $push17=, $1=, $pop18
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $2, $pop17
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, 31
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push16=, $2, $pop17
+; NO-SIMD128-FAST-NEXT: local.tee $push15=, $1=, $pop16
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $2, $pop15
; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $pop3, $1
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 31
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push15=, $3, $pop16
-; NO-SIMD128-FAST-NEXT: local.tee $push14=, $2=, $pop15
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $3, $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, 31
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push13=, $3, $pop14
+; NO-SIMD128-FAST-NEXT: local.tee $push12=, $2=, $pop13
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $3, $pop12
; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop5, $2
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 31
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $4, $pop13
-; NO-SIMD128-FAST-NEXT: local.tee $push11=, $0=, $pop12
-; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $4, $pop11
-; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $pop7, $0
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push11=, 31
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push10=, $4, $pop11
+; NO-SIMD128-FAST-NEXT: local.tee $push9=, $3=, $pop10
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $4, $pop9
+; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $pop7, $3
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%a = sub <4 x i32> zeroinitializer, %x
%b = icmp slt <4 x i32> %x, zeroinitializer
@@ -10418,19 +8762,17 @@ define <4 x i32> @neg_v4i32(<4 x i32> %x) {
; NO-SIMD128: .functype neg_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 0
-; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $3
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push9=, 0
-; NO-SIMD128-NEXT: i32.sub $push2=, $pop9, $2
-; NO-SIMD128-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push8=, 0
-; NO-SIMD128-NEXT: i32.sub $push3=, $pop8, $1
-; NO-SIMD128-NEXT: i32.store 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $4
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
; NO-SIMD128-NEXT: i32.const $push7=, 0
-; NO-SIMD128-NEXT: i32.sub $push4=, $pop7, $4
-; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT: i32.sub $push2=, $pop7, $3
+; NO-SIMD128-NEXT: i32.store 8($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push6=, 0
+; NO-SIMD128-NEXT: i32.sub $push3=, $pop6, $2
+; NO-SIMD128-NEXT: i32.store 4($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push5=, 0
+; NO-SIMD128-NEXT: i32.sub $push4=, $pop5, $1
+; NO-SIMD128-NEXT: i32.store 0($0), $pop4
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: neg_v4i32:
@@ -10439,17 +8781,15 @@ define <4 x i32> @neg_v4i32(<4 x i32> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 0
; NO-SIMD128-FAST-NEXT: i32.sub $push1=, $pop0, $1
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop9, $2
+; NO-SIMD128-FAST-NEXT: i32.const $push7=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop7, $2
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop8, $3
+; NO-SIMD128-FAST-NEXT: i32.const $push6=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop6, $3
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop7, $4
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push5=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $pop5, $4
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop4
; NO-SIMD128-FAST-NEXT: return
%a = sub <4 x i32> <i32 0, i32 0, i32 0, i32 0>, %x
ret <4 x i32> %a
@@ -10471,16 +8811,14 @@ define <4 x i32> @shl_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-LABEL: shl_v4i32:
; NO-SIMD128: .functype shl_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shl $push0=, $3, $5
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shl $push1=, $2, $5
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shl $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shl $push3=, $4, $5
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shl $push0=, $4, $5
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shl $push1=, $3, $5
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shl $push2=, $2, $5
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shl $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_v4i32:
@@ -10492,10 +8830,8 @@ define <4 x i32> @shl_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $3, $5
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $4, $5
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $4, $5
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <4 x i32> undef, i32 %x, i32 0
%s = shufflevector <4 x i32> %t, <4 x i32> undef,
@@ -10523,19 +8859,17 @@ define <4 x i32> @shl_const_v4i32(<4 x i32> %v) {
; NO-SIMD128: .functype shl_const_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 5
-; NO-SIMD128-NEXT: i32.shl $push1=, $3, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push9=, 5
-; NO-SIMD128-NEXT: i32.shl $push2=, $2, $pop9
-; NO-SIMD128-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push8=, 5
-; NO-SIMD128-NEXT: i32.shl $push3=, $1, $pop8
-; NO-SIMD128-NEXT: i32.store 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT: i32.shl $push1=, $4, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
; NO-SIMD128-NEXT: i32.const $push7=, 5
-; NO-SIMD128-NEXT: i32.shl $push4=, $4, $pop7
-; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT: i32.shl $push2=, $3, $pop7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push6=, 5
+; NO-SIMD128-NEXT: i32.shl $push3=, $2, $pop6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push5=, 5
+; NO-SIMD128-NEXT: i32.shl $push4=, $1, $pop5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop4
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_const_v4i32:
@@ -10544,17 +8878,15 @@ define <4 x i32> @shl_const_v4i32(<4 x i32> %v) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 5
; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop7
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push6=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop6
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push5=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $pop5
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop4
; NO-SIMD128-FAST-NEXT: return
%a = shl <4 x i32> %v, <i32 5, i32 5, i32 5, i32 5>
ret <4 x i32> %a
@@ -10606,16 +8938,14 @@ define <4 x i32> @shl_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-LABEL: shl_vec_v4i32:
; NO-SIMD128: .functype shl_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shl $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shl $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shl $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shl $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shl $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shl $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shl $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shl $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_vec_v4i32:
@@ -10627,10 +8957,8 @@ define <4 x i32> @shl_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = shl <4 x i32> %v, %x
ret <4 x i32> %a
@@ -10652,16 +8980,14 @@ define <4 x i32> @shr_s_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-LABEL: shr_s_v4i32:
; NO-SIMD128: .functype shr_s_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shr_s $push0=, $3, $5
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shr_s $push1=, $2, $5
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shr_s $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shr_s $push3=, $4, $5
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shr_s $push0=, $4, $5
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shr_s $push1=, $3, $5
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shr_s $push2=, $2, $5
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shr_s $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_v4i32:
@@ -10673,10 +8999,8 @@ define <4 x i32> @shr_s_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $3, $5
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push5=, $4, $5
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push3=, $4, $5
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <4 x i32> undef, i32 %x, i32 0
%s = shufflevector <4 x i32> %t, <4 x i32> undef,
@@ -10731,16 +9055,14 @@ define <4 x i32> @shr_s_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-LABEL: shr_s_vec_v4i32:
; NO-SIMD128: .functype shr_s_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shr_s $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shr_s $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shr_s $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shr_s $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shr_s $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shr_s $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shr_s $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shr_s $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_vec_v4i32:
@@ -10752,10 +9074,8 @@ define <4 x i32> @shr_s_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = ashr <4 x i32> %v, %x
ret <4 x i32> %a
@@ -10777,16 +9097,14 @@ define <4 x i32> @shr_u_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-LABEL: shr_u_v4i32:
; NO-SIMD128: .functype shr_u_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shr_u $push0=, $3, $5
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shr_u $push1=, $2, $5
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shr_u $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shr_u $push3=, $4, $5
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shr_u $push0=, $4, $5
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shr_u $push1=, $3, $5
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shr_u $push2=, $2, $5
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shr_u $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_v4i32:
@@ -10798,10 +9116,8 @@ define <4 x i32> @shr_u_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $3, $5
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $4, $5
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push3=, $4, $5
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <4 x i32> undef, i32 %x, i32 0
%s = shufflevector <4 x i32> %t, <4 x i32> undef,
@@ -10856,16 +9172,14 @@ define <4 x i32> @shr_u_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-LABEL: shr_u_vec_v4i32:
; NO-SIMD128: .functype shr_u_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shr_u $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shr_u $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shr_u $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shr_u $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shr_u $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shr_u $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shr_u $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shr_u $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_vec_v4i32:
@@ -10877,10 +9191,8 @@ define <4 x i32> @shr_u_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = lshr <4 x i32> %v, %x
ret <4 x i32> %a
@@ -10902,16 +9214,14 @@ define <4 x i32> @and_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: and_v4i32:
; NO-SIMD128: .functype and_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.and $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.and $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.and $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.and $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.and $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.and $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.and $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.and $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: and_v4i32:
@@ -10923,10 +9233,8 @@ define <4 x i32> @and_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = and <4 x i32> %x, %y
ret <4 x i32> %a
@@ -10948,16 +9256,14 @@ define <4 x i32> @or_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: or_v4i32:
; NO-SIMD128: .functype or_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.or $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.or $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.or $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.or $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.or $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.or $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.or $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.or $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: or_v4i32:
@@ -10969,10 +9275,8 @@ define <4 x i32> @or_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.or $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.or $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.or $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = or <4 x i32> %x, %y
ret <4 x i32> %a
@@ -10994,16 +9298,14 @@ define <4 x i32> @xor_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: xor_v4i32:
; NO-SIMD128: .functype xor_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.xor $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.xor $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.xor $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.xor $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.xor $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.xor $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.xor $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: xor_v4i32:
@@ -11015,10 +9317,8 @@ define <4 x i32> @xor_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = xor <4 x i32> %x, %y
ret <4 x i32> %a
@@ -11041,19 +9341,17 @@ define <4 x i32> @not_v4i32(<4 x i32> %x) {
; NO-SIMD128: .functype not_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $3, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push9=, -1
-; NO-SIMD128-NEXT: i32.xor $push2=, $2, $pop9
-; NO-SIMD128-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push8=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $1, $pop8
-; NO-SIMD128-NEXT: i32.store 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT: i32.xor $push1=, $4, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
; NO-SIMD128-NEXT: i32.const $push7=, -1
-; NO-SIMD128-NEXT: i32.xor $push4=, $4, $pop7
-; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT: i32.xor $push2=, $3, $pop7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push6=, -1
+; NO-SIMD128-NEXT: i32.xor $push3=, $2, $pop6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push5=, -1
+; NO-SIMD128-NEXT: i32.xor $push4=, $1, $pop5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop4
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: not_v4i32:
@@ -11062,17 +9360,15 @@ define <4 x i32> @not_v4i32(<4 x i32> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, -1
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push7=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop7
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push6=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop6
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $4, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push5=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $4, $pop5
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop4
; NO-SIMD128-FAST-NEXT: return
%a = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
ret <4 x i32> %a
@@ -11096,23 +9392,21 @@ define <4 x i32> @andnot_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128: .functype andnot_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $7, $pop0
-; NO-SIMD128-NEXT: i32.and $push2=, $3, $pop1
-; NO-SIMD128-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push13=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $6, $pop13
-; NO-SIMD128-NEXT: i32.and $push4=, $2, $pop3
-; NO-SIMD128-NEXT: i32.store 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push12=, -1
-; NO-SIMD128-NEXT: i32.xor $push5=, $5, $pop12
-; NO-SIMD128-NEXT: i32.and $push6=, $1, $pop5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT: i32.xor $push1=, $8, $pop0
+; NO-SIMD128-NEXT: i32.and $push2=, $4, $pop1
+; NO-SIMD128-NEXT: i32.store 12($0), $pop2
; NO-SIMD128-NEXT: i32.const $push11=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $8, $pop11
-; NO-SIMD128-NEXT: i32.and $push8=, $4, $pop7
-; NO-SIMD128-NEXT: i32.store 0($pop10), $pop8
+; NO-SIMD128-NEXT: i32.xor $push3=, $7, $pop11
+; NO-SIMD128-NEXT: i32.and $push4=, $3, $pop3
+; NO-SIMD128-NEXT: i32.store 8($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push10=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $6, $pop10
+; NO-SIMD128-NEXT: i32.and $push6=, $2, $pop5
+; NO-SIMD128-NEXT: i32.store 4($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push9=, -1
+; NO-SIMD128-NEXT: i32.xor $push7=, $5, $pop9
+; NO-SIMD128-NEXT: i32.and $push8=, $1, $pop7
+; NO-SIMD128-NEXT: i32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: andnot_v4i32:
@@ -11122,20 +9416,18 @@ define <4 x i32> @andnot_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $5, $pop0
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop1
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $6, $pop13
+; NO-SIMD128-FAST-NEXT: i32.const $push11=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $6, $pop11
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $2, $pop3
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push12=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $7, $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push10=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $7, $pop10
; NO-SIMD128-FAST-NEXT: i32.and $push6=, $3, $pop5
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $8, $pop11
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $4, $pop9
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop8), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push9=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $8, $pop9
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $4, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%inv_y = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
%a = and <4 x i32> %x, %inv_y
@@ -11161,32 +9453,30 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
; NO-SIMD128-LABEL: bitselect_v4i32:
; NO-SIMD128: .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.const $push1=, -1
; NO-SIMD128-NEXT: i32.xor $push2=, $4, $pop1
; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $12
; NO-SIMD128-NEXT: i32.and $push0=, $4, $8
; NO-SIMD128-NEXT: i32.or $push4=, $pop3, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push21=, -1
-; NO-SIMD128-NEXT: i32.xor $push8=, $3, $pop21
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $11
-; NO-SIMD128-NEXT: i32.and $push7=, $3, $7
-; NO-SIMD128-NEXT: i32.or $push10=, $pop9, $pop7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop10
-; NO-SIMD128-NEXT: i32.const $push20=, -1
-; NO-SIMD128-NEXT: i32.xor $push12=, $2, $pop20
-; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $10
-; NO-SIMD128-NEXT: i32.and $push11=, $2, $6
-; NO-SIMD128-NEXT: i32.or $push14=, $pop13, $pop11
-; NO-SIMD128-NEXT: i32.store 4($0), $pop14
+; NO-SIMD128-NEXT: i32.store 12($0), $pop4
; NO-SIMD128-NEXT: i32.const $push19=, -1
-; NO-SIMD128-NEXT: i32.xor $push16=, $1, $pop19
-; NO-SIMD128-NEXT: i32.and $push17=, $pop16, $9
-; NO-SIMD128-NEXT: i32.and $push15=, $1, $5
-; NO-SIMD128-NEXT: i32.or $push18=, $pop17, $pop15
-; NO-SIMD128-NEXT: i32.store 0($0), $pop18
+; NO-SIMD128-NEXT: i32.xor $push6=, $3, $pop19
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $11
+; NO-SIMD128-NEXT: i32.and $push5=, $3, $7
+; NO-SIMD128-NEXT: i32.or $push8=, $pop7, $pop5
+; NO-SIMD128-NEXT: i32.store 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push18=, -1
+; NO-SIMD128-NEXT: i32.xor $push10=, $2, $pop18
+; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $10
+; NO-SIMD128-NEXT: i32.and $push9=, $2, $6
+; NO-SIMD128-NEXT: i32.or $push12=, $pop11, $pop9
+; NO-SIMD128-NEXT: i32.store 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push17=, -1
+; NO-SIMD128-NEXT: i32.xor $push14=, $1, $pop17
+; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $9
+; NO-SIMD128-NEXT: i32.and $push13=, $1, $5
+; NO-SIMD128-NEXT: i32.or $push16=, $pop15, $pop13
+; NO-SIMD128-NEXT: i32.store 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_v4i32:
@@ -11198,26 +9488,24 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
; NO-SIMD128-FAST-NEXT: i32.and $push0=, $1, $5
; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop3, $pop0
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop19
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $10
; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $6
; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop7, $pop5
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop18
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $11
; NO-SIMD128-FAST-NEXT: i32.and $push9=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop11, $pop9
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop19
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop17
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $12
; NO-SIMD128-FAST-NEXT: i32.and $push13=, $4, $8
; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop15, $pop13
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop18), $pop16
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%masked_v1 = and <4 x i32> %c, %v1
%inv_mask = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %c
@@ -11244,24 +9532,22 @@ define <4 x i32> @bitselect_xor_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2
; NO-SIMD128-LABEL: bitselect_xor_v4i32:
; NO-SIMD128: .functype bitselect_xor_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
; NO-SIMD128-NEXT: i32.xor $push0=, $8, $12
; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $4
; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $12
-; NO-SIMD128-NEXT: i32.store 0($pop4), $pop2
-; NO-SIMD128-NEXT: i32.xor $push5=, $7, $11
-; NO-SIMD128-NEXT: i32.and $push6=, $pop5, $3
-; NO-SIMD128-NEXT: i32.xor $push7=, $pop6, $11
-; NO-SIMD128-NEXT: i32.store 8($0), $pop7
-; NO-SIMD128-NEXT: i32.xor $push8=, $6, $10
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $2
-; NO-SIMD128-NEXT: i32.xor $push10=, $pop9, $10
-; NO-SIMD128-NEXT: i32.store 4($0), $pop10
-; NO-SIMD128-NEXT: i32.xor $push11=, $5, $9
-; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $1
-; NO-SIMD128-NEXT: i32.xor $push13=, $pop12, $9
-; NO-SIMD128-NEXT: i32.store 0($0), $pop13
+; NO-SIMD128-NEXT: i32.store 12($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $7, $11
+; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $3
+; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $11
+; NO-SIMD128-NEXT: i32.store 8($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $6, $10
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $2
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $10
+; NO-SIMD128-NEXT: i32.store 4($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push9=, $5, $9
+; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $1
+; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $9
+; NO-SIMD128-NEXT: i32.store 0($0), $pop11
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_v4i32:
@@ -11279,12 +9565,10 @@ define <4 x i32> @bitselect_xor_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $11
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $8, $12
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $4
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $pop12, $12
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop10), $pop13
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $8, $12
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $12
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop11
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <4 x i32> %v1, %v2
%and = and <4 x i32> %xor1, %c
@@ -11311,32 +9595,30 @@ define <4 x i32> @bitselect_xor_reversed_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x
; NO-SIMD128-LABEL: bitselect_xor_reversed_v4i32:
; NO-SIMD128: .functype bitselect_xor_reversed_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.xor $push2=, $8, $12
; NO-SIMD128-NEXT: i32.const $push0=, -1
; NO-SIMD128-NEXT: i32.xor $push1=, $4, $pop0
; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.xor $push4=, $pop3, $12
-; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.xor $push8=, $7, $11
-; NO-SIMD128-NEXT: i32.const $push21=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $3, $pop21
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.xor $push10=, $pop9, $11
-; NO-SIMD128-NEXT: i32.store 8($0), $pop10
-; NO-SIMD128-NEXT: i32.xor $push12=, $6, $10
-; NO-SIMD128-NEXT: i32.const $push20=, -1
-; NO-SIMD128-NEXT: i32.xor $push11=, $2, $pop20
-; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $pop11
-; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $10
-; NO-SIMD128-NEXT: i32.store 4($0), $pop14
-; NO-SIMD128-NEXT: i32.xor $push16=, $5, $9
+; NO-SIMD128-NEXT: i32.store 12($0), $pop4
+; NO-SIMD128-NEXT: i32.xor $push6=, $7, $11
; NO-SIMD128-NEXT: i32.const $push19=, -1
-; NO-SIMD128-NEXT: i32.xor $push15=, $1, $pop19
-; NO-SIMD128-NEXT: i32.and $push17=, $pop16, $pop15
-; NO-SIMD128-NEXT: i32.xor $push18=, $pop17, $9
-; NO-SIMD128-NEXT: i32.store 0($0), $pop18
+; NO-SIMD128-NEXT: i32.xor $push5=, $3, $pop19
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $11
+; NO-SIMD128-NEXT: i32.store 8($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push10=, $6, $10
+; NO-SIMD128-NEXT: i32.const $push18=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $2, $pop18
+; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $10
+; NO-SIMD128-NEXT: i32.store 4($0), $pop12
+; NO-SIMD128-NEXT: i32.xor $push14=, $5, $9
+; NO-SIMD128-NEXT: i32.const $push17=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $1, $pop17
+; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $9
+; NO-SIMD128-NEXT: i32.store 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_reversed_v4i32:
@@ -11349,25 +9631,23 @@ define <4 x i32> @bitselect_xor_reversed_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x
; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $pop3, $9
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop4
; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $6, $10
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop19
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $10
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop8
; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $7, $11
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop18
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $pop11, $11
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $8, $12
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop19
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop17
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $12
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop18), $pop16
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <4 x i32> %v1, %v2
%notc = xor <4 x i32> %c, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -11394,24 +9674,22 @@ define <4 x i32> @extmul_low_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-LABEL: extmul_low_s_v4i32:
; NO-SIMD128: .functype extmul_low_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend16_s $push1=, $3
-; NO-SIMD128-NEXT: i32.extend16_s $push0=, $11
+; NO-SIMD128-NEXT: i32.extend16_s $push1=, $4
+; NO-SIMD128-NEXT: i32.extend16_s $push0=, $12
; NO-SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend16_s $push4=, $2
-; NO-SIMD128-NEXT: i32.extend16_s $push3=, $10
+; NO-SIMD128-NEXT: i32.store 12($0), $pop2
+; NO-SIMD128-NEXT: i32.extend16_s $push4=, $3
+; NO-SIMD128-NEXT: i32.extend16_s $push3=, $11
; NO-SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; NO-SIMD128-NEXT: i32.store 4($0), $pop5
-; NO-SIMD128-NEXT: i32.extend16_s $push7=, $1
-; NO-SIMD128-NEXT: i32.extend16_s $push6=, $9
+; NO-SIMD128-NEXT: i32.store 8($0), $pop5
+; NO-SIMD128-NEXT: i32.extend16_s $push7=, $2
+; NO-SIMD128-NEXT: i32.extend16_s $push6=, $10
; NO-SIMD128-NEXT: i32.mul $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.store 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 12
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.extend16_s $push10=, $4
-; NO-SIMD128-NEXT: i32.extend16_s $push9=, $12
+; NO-SIMD128-NEXT: i32.store 4($0), $pop8
+; NO-SIMD128-NEXT: i32.extend16_s $push10=, $1
+; NO-SIMD128-NEXT: i32.extend16_s $push9=, $9
; NO-SIMD128-NEXT: i32.mul $push11=, $pop10, $pop9
-; NO-SIMD128-NEXT: i32.store 0($pop13), $pop11
+; NO-SIMD128-NEXT: i32.store 0($0), $pop11
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_low_s_v4i32:
@@ -11429,12 +9707,10 @@ define <4 x i32> @extmul_low_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push6=, $11
; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $pop7, $pop6
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push12=, $4
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $12
-; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $pop12, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop10), $pop13
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push10=, $4
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push9=, $12
+; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop11
; NO-SIMD128-FAST-NEXT: return
%low1 = shufflevector <8 x i16> %v1, <8 x i16> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -11464,24 +9740,22 @@ define <4 x i32> @extmul_high_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-LABEL: extmul_high_s_v4i32:
; NO-SIMD128: .functype extmul_high_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend16_s $push1=, $7
-; NO-SIMD128-NEXT: i32.extend16_s $push0=, $15
+; NO-SIMD128-NEXT: i32.extend16_s $push1=, $8
+; NO-SIMD128-NEXT: i32.extend16_s $push0=, $16
; NO-SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend16_s $push4=, $6
-; NO-SIMD128-NEXT: i32.extend16_s $push3=, $14
+; NO-SIMD128-NEXT: i32.store 12($0), $pop2
+; NO-SIMD128-NEXT: i32.extend16_s $push4=, $7
+; NO-SIMD128-NEXT: i32.extend16_s $push3=, $15
; NO-SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; NO-SIMD128-NEXT: i32.store 4($0), $pop5
-; NO-SIMD128-NEXT: i32.extend16_s $push7=, $5
-; NO-SIMD128-NEXT: i32.extend16_s $push6=, $13
+; NO-SIMD128-NEXT: i32.store 8($0), $pop5
+; NO-SIMD128-NEXT: i32.extend16_s $push7=, $6
+; NO-SIMD128-NEXT: i32.extend16_s $push6=, $14
; NO-SIMD128-NEXT: i32.mul $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.store 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 12
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.extend16_s $push10=, $8
-; NO-SIMD128-NEXT: i32.extend16_s $push9=, $16
+; NO-SIMD128-NEXT: i32.store 4($0), $pop8
+; NO-SIMD128-NEXT: i32.extend16_s $push10=, $5
+; NO-SIMD128-NEXT: i32.extend16_s $push9=, $13
; NO-SIMD128-NEXT: i32.mul $push11=, $pop10, $pop9
-; NO-SIMD128-NEXT: i32.store 0($pop13), $pop11
+; NO-SIMD128-NEXT: i32.store 0($0), $pop11
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_high_s_v4i32:
@@ -11499,12 +9773,10 @@ define <4 x i32> @extmul_high_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push6=, $15
; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $pop7, $pop6
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push12=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $16
-; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $pop12, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop10), $pop13
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push10=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push9=, $16
+; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop11
; NO-SIMD128-FAST-NEXT: return
%high1 = shufflevector <8 x i16> %v1, <8 x i16> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -11535,31 +9807,29 @@ define <4 x i32> @extmul_low_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128: .functype extmul_low_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push2=, $3, $pop0
-; NO-SIMD128-NEXT: i32.const $push21=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $11, $pop21
-; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push20=, 65535
-; NO-SIMD128-NEXT: i32.and $push5=, $2, $pop20
+; NO-SIMD128-NEXT: i32.and $push2=, $4, $pop0
; NO-SIMD128-NEXT: i32.const $push19=, 65535
-; NO-SIMD128-NEXT: i32.and $push4=, $10, $pop19
-; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store 4($0), $pop6
+; NO-SIMD128-NEXT: i32.and $push1=, $12, $pop19
+; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-NEXT: i32.const $push18=, 65535
-; NO-SIMD128-NEXT: i32.and $push8=, $1, $pop18
+; NO-SIMD128-NEXT: i32.and $push5=, $3, $pop18
; NO-SIMD128-NEXT: i32.const $push17=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $9, $pop17
-; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store 0($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 12
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT: i32.and $push4=, $11, $pop17
+; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store 8($0), $pop6
; NO-SIMD128-NEXT: i32.const $push16=, 65535
-; NO-SIMD128-NEXT: i32.and $push11=, $4, $pop16
+; NO-SIMD128-NEXT: i32.and $push8=, $2, $pop16
; NO-SIMD128-NEXT: i32.const $push15=, 65535
-; NO-SIMD128-NEXT: i32.and $push10=, $12, $pop15
+; NO-SIMD128-NEXT: i32.and $push7=, $10, $pop15
+; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store 4($0), $pop9
+; NO-SIMD128-NEXT: i32.const $push14=, 65535
+; NO-SIMD128-NEXT: i32.and $push11=, $1, $pop14
+; NO-SIMD128-NEXT: i32.const $push13=, 65535
+; NO-SIMD128-NEXT: i32.and $push10=, $9, $pop13
; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store 0($pop14), $pop12
+; NO-SIMD128-NEXT: i32.store 0($0), $pop12
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_low_u_v4i32:
@@ -11567,30 +9837,28 @@ define <4 x i32> @extmul_low_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop19
; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop19
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push18=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop18
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop18
; NO-SIMD128-FAST-NEXT: i32.const $push17=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop17
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop17
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push16=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop16
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop16
; NO-SIMD128-FAST-NEXT: i32.const $push15=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop15
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop15
+; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push13=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop13
; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop14), $pop12
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop12
; NO-SIMD128-FAST-NEXT: return
%low1 = shufflevector <8 x i16> %v1, <8 x i16> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -11621,31 +9889,29 @@ define <4 x i32> @extmul_high_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128: .functype extmul_high_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push2=, $7, $pop0
-; NO-SIMD128-NEXT: i32.const $push21=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $15, $pop21
-; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push20=, 65535
-; NO-SIMD128-NEXT: i32.and $push5=, $6, $pop20
+; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0
; NO-SIMD128-NEXT: i32.const $push19=, 65535
-; NO-SIMD128-NEXT: i32.and $push4=, $14, $pop19
-; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store 4($0), $pop6
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop19
+; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-NEXT: i32.const $push18=, 65535
-; NO-SIMD128-NEXT: i32.and $push8=, $5, $pop18
+; NO-SIMD128-NEXT: i32.and $push5=, $7, $pop18
; NO-SIMD128-NEXT: i32.const $push17=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $13, $pop17
-; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store 0($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 12
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT: i32.and $push4=, $15, $pop17
+; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store 8($0), $pop6
; NO-SIMD128-NEXT: i32.const $push16=, 65535
-; NO-SIMD128-NEXT: i32.and $push11=, $8, $pop16
+; NO-SIMD128-NEXT: i32.and $push8=, $6, $pop16
; NO-SIMD128-NEXT: i32.const $push15=, 65535
-; NO-SIMD128-NEXT: i32.and $push10=, $16, $pop15
+; NO-SIMD128-NEXT: i32.and $push7=, $14, $pop15
+; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store 4($0), $pop9
+; NO-SIMD128-NEXT: i32.const $push14=, 65535
+; NO-SIMD128-NEXT: i32.and $push11=, $5, $pop14
+; NO-SIMD128-NEXT: i32.const $push13=, 65535
+; NO-SIMD128-NEXT: i32.and $push10=, $13, $pop13
; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store 0($pop14), $pop12
+; NO-SIMD128-NEXT: i32.store 0($0), $pop12
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_high_u_v4i32:
@@ -11653,30 +9919,28 @@ define <4 x i32> @extmul_high_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $5, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $13, $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $13, $pop19
; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $6, $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $14, $pop19
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push18=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $7, $pop18
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $6, $pop18
; NO-SIMD128-FAST-NEXT: i32.const $push17=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $15, $pop17
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $14, $pop17
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push16=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $8, $pop16
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $7, $pop16
; NO-SIMD128-FAST-NEXT: i32.const $push15=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $16, $pop15
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $15, $pop15
+; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $8, $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push13=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $16, $pop13
; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop14), $pop12
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop12
; NO-SIMD128-FAST-NEXT: return
%high1 = shufflevector <8 x i16> %v1, <8 x i16> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -13061,16 +11325,14 @@ define <4 x float> @neg_v4f32(<4 x float> %x) {
; NO-SIMD128-LABEL: neg_v4f32:
; NO-SIMD128: .functype neg_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.neg $push0=, $3
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.neg $push1=, $2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.neg $push2=, $1
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: f32.neg $push5=, $4
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: f32.neg $push0=, $4
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.neg $push1=, $3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.neg $push2=, $2
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.neg $push3=, $1
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: neg_v4f32:
@@ -13082,10 +11344,8 @@ define <4 x float> @neg_v4f32(<4 x float> %x) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.neg $push2=, $3
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.neg $push5=, $4
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.neg $push3=, $4
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = fsub nsz <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, %x
ret <4 x float> %a
@@ -13108,16 +11368,14 @@ define <4 x float> @abs_v4f32(<4 x float> %x) {
; NO-SIMD128-LABEL: abs_v4f32:
; NO-SIMD128: .functype abs_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.abs $push0=, $3
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.abs $push1=, $2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.abs $push2=, $1
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: f32.abs $push5=, $4
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: f32.abs $push0=, $4
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.abs $push1=, $3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.abs $push2=, $2
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.abs $push3=, $1
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: abs_v4f32:
@@ -13129,10 +11387,8 @@ define <4 x float> @abs_v4f32(<4 x float> %x) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.abs $push2=, $3
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.abs $push5=, $4
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.abs $push3=, $4
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
ret <4 x float> %a
@@ -13157,54 +11413,50 @@ define <4 x float> @min_unordered_v4f32(<4 x float> %x) {
; NO-SIMD128: .functype min_unordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.gt $push1=, $3, $pop17
-; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $3, $pop1
-; NO-SIMD128-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-NEXT: f32.const $push16=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.gt $push3=, $2, $pop15
-; NO-SIMD128-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.gt $push1=, $4, $pop15
+; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $4, $pop1
+; NO-SIMD128-NEXT: f32.store 12($0), $pop2
; NO-SIMD128-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.gt $push5=, $1, $pop13
-; NO-SIMD128-NEXT: f32.select $push6=, $pop14, $1, $pop5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT: f32.gt $push3=, $3, $pop13
+; NO-SIMD128-NEXT: f32.select $push4=, $pop14, $3, $pop3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop4
; NO-SIMD128-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.gt $push7=, $4, $pop11
-; NO-SIMD128-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-NEXT: f32.gt $push5=, $2, $pop11
+; NO-SIMD128-NEXT: f32.select $push6=, $pop12, $2, $pop5
+; NO-SIMD128-NEXT: f32.store 4($0), $pop6
+; NO-SIMD128-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.gt $push7=, $1, $pop9
+; NO-SIMD128-NEXT: f32.select $push8=, $pop10, $1, $pop7
+; NO-SIMD128-NEXT: f32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_unordered_v4f32:
; NO-SIMD128-FAST: .functype min_unordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.gt $push1=, $1, $pop17
+; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.gt $push1=, $1, $pop15
; NO-SIMD128-FAST-NEXT: f32.select $push2=, $pop0, $1, $pop1
; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: f32.const $push16=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.gt $push3=, $2, $pop15
-; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.gt $push5=, $3, $pop13
-; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop14, $3, $pop5
-; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT: f32.gt $push3=, $2, $pop13
+; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop14, $2, $pop3
+; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.gt $push7=, $4, $pop11
-; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT: f32.gt $push5=, $3, $pop11
+; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop12, $3, $pop5
+; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
+; NO-SIMD128-FAST-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.gt $push7=, $4, $pop9
+; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop10, $4, $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%cmps = fcmp ule <4 x float> %x, <float 5., float 5., float 5., float 5.>
%a = select <4 x i1> %cmps, <4 x float> %x,
@@ -13231,54 +11483,50 @@ define <4 x float> @max_unordered_v4f32(<4 x float> %x) {
; NO-SIMD128: .functype max_unordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.lt $push1=, $3, $pop17
-; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $3, $pop1
-; NO-SIMD128-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-NEXT: f32.const $push16=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.lt $push3=, $2, $pop15
-; NO-SIMD128-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.lt $push1=, $4, $pop15
+; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $4, $pop1
+; NO-SIMD128-NEXT: f32.store 12($0), $pop2
; NO-SIMD128-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.lt $push5=, $1, $pop13
-; NO-SIMD128-NEXT: f32.select $push6=, $pop14, $1, $pop5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT: f32.lt $push3=, $3, $pop13
+; NO-SIMD128-NEXT: f32.select $push4=, $pop14, $3, $pop3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop4
; NO-SIMD128-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.lt $push7=, $4, $pop11
-; NO-SIMD128-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-NEXT: f32.lt $push5=, $2, $pop11
+; NO-SIMD128-NEXT: f32.select $push6=, $pop12, $2, $pop5
+; NO-SIMD128-NEXT: f32.store 4($0), $pop6
+; NO-SIMD128-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.lt $push7=, $1, $pop9
+; NO-SIMD128-NEXT: f32.select $push8=, $pop10, $1, $pop7
+; NO-SIMD128-NEXT: f32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_unordered_v4f32:
; NO-SIMD128-FAST: .functype max_unordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.lt $push1=, $1, $pop17
+; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.lt $push1=, $1, $pop15
; NO-SIMD128-FAST-NEXT: f32.select $push2=, $pop0, $1, $pop1
; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: f32.const $push16=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.lt $push3=, $2, $pop15
-; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.lt $push5=, $3, $pop13
-; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop14, $3, $pop5
-; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT: f32.lt $push3=, $2, $pop13
+; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop14, $2, $pop3
+; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.lt $push7=, $4, $pop11
-; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT: f32.lt $push5=, $3, $pop11
+; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop12, $3, $pop5
+; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
+; NO-SIMD128-FAST-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.lt $push7=, $4, $pop9
+; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop10, $4, $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%cmps = fcmp uge <4 x float> %x, <float 5., float 5., float 5., float 5.>
%a = select <4 x i1> %cmps, <4 x float> %x,
@@ -13305,54 +11553,50 @@ define <4 x float> @min_ordered_v4f32(<4 x float> %x) {
; NO-SIMD128: .functype min_ordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.ge $push1=, $3, $pop17
-; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $3, $pop1
-; NO-SIMD128-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-NEXT: f32.const $push16=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.ge $push3=, $2, $pop15
-; NO-SIMD128-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.ge $push1=, $4, $pop15
+; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $4, $pop1
+; NO-SIMD128-NEXT: f32.store 12($0), $pop2
; NO-SIMD128-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.ge $push5=, $1, $pop13
-; NO-SIMD128-NEXT: f32.select $push6=, $pop14, $1, $pop5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT: f32.ge $push3=, $3, $pop13
+; NO-SIMD128-NEXT: f32.select $push4=, $pop14, $3, $pop3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop4
; NO-SIMD128-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.ge $push7=, $4, $pop11
-; NO-SIMD128-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-NEXT: f32.ge $push5=, $2, $pop11
+; NO-SIMD128-NEXT: f32.select $push6=, $pop12, $2, $pop5
+; NO-SIMD128-NEXT: f32.store 4($0), $pop6
+; NO-SIMD128-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.ge $push7=, $1, $pop9
+; NO-SIMD128-NEXT: f32.select $push8=, $pop10, $1, $pop7
+; NO-SIMD128-NEXT: f32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_ordered_v4f32:
; NO-SIMD128-FAST: .functype min_ordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.ge $push1=, $1, $pop17
+; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.ge $push1=, $1, $pop15
; NO-SIMD128-FAST-NEXT: f32.select $push2=, $pop0, $1, $pop1
; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: f32.const $push16=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.ge $push3=, $2, $pop15
-; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.ge $push5=, $3, $pop13
-; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop14, $3, $pop5
-; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT: f32.ge $push3=, $2, $pop13
+; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop14, $2, $pop3
+; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.ge $push7=, $4, $pop11
-; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT: f32.ge $push5=, $3, $pop11
+; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop12, $3, $pop5
+; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
+; NO-SIMD128-FAST-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.ge $push7=, $4, $pop9
+; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop10, $4, $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%cmps = fcmp ole <4 x float> <float 5., float 5., float 5., float 5.>, %x
%a = select <4 x i1> %cmps,
@@ -13379,54 +11623,50 @@ define <4 x float> @max_ordered_v4f32(<4 x float> %x) {
; NO-SIMD128: .functype max_ordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.le $push1=, $3, $pop17
-; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $3, $pop1
-; NO-SIMD128-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-NEXT: f32.const $push16=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.le $push3=, $2, $pop15
-; NO-SIMD128-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.le $push1=, $4, $pop15
+; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $4, $pop1
+; NO-SIMD128-NEXT: f32.store 12($0), $pop2
; NO-SIMD128-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.le $push5=, $1, $pop13
-; NO-SIMD128-NEXT: f32.select $push6=, $pop14, $1, $pop5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT: f32.le $push3=, $3, $pop13
+; NO-SIMD128-NEXT: f32.select $push4=, $pop14, $3, $pop3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop4
; NO-SIMD128-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.le $push7=, $4, $pop11
-; NO-SIMD128-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-NEXT: f32.le $push5=, $2, $pop11
+; NO-SIMD128-NEXT: f32.select $push6=, $pop12, $2, $pop5
+; NO-SIMD128-NEXT: f32.store 4($0), $pop6
+; NO-SIMD128-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.le $push7=, $1, $pop9
+; NO-SIMD128-NEXT: f32.select $push8=, $pop10, $1, $pop7
+; NO-SIMD128-NEXT: f32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_ordered_v4f32:
; NO-SIMD128-FAST: .functype max_ordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.le $push1=, $1, $pop17
+; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.le $push1=, $1, $pop15
; NO-SIMD128-FAST-NEXT: f32.select $push2=, $pop0, $1, $pop1
; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: f32.const $push16=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.le $push3=, $2, $pop15
-; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.le $push5=, $3, $pop13
-; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop14, $3, $pop5
-; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT: f32.le $push3=, $2, $pop13
+; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop14, $2, $pop3
+; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.le $push7=, $4, $pop11
-; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT: f32.le $push5=, $3, $pop11
+; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop12, $3, $pop5
+; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
+; NO-SIMD128-FAST-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.le $push7=, $4, $pop9
+; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop10, $4, $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%cmps = fcmp oge <4 x float> <float 5., float 5., float 5., float 5.>, %x
%a = select <4 x i1> %cmps,
@@ -13451,16 +11691,14 @@ define <4 x float> @min_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: min_intrinsic_v4f32:
; NO-SIMD128: .functype min_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.min $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.min $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.min $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.min $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.min $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.min $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.min $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.min $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_intrinsic_v4f32:
@@ -13472,10 +11710,8 @@ define <4 x float> @min_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.min $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.min $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.min $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -13552,16 +11788,14 @@ define <4 x float> @minnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: minnum_intrinsic_v4f32:
; NO-SIMD128: .functype minnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: call $push0=, fminf, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: call $push1=, fminf, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: call $push2=, fminf, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: call $push5=, fminf, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: call $push0=, fminf, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: call $push1=, fminf, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: call $push2=, fminf, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: call $push3=, fminf, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: minnum_intrinsic_v4f32:
@@ -13573,10 +11807,8 @@ define <4 x float> @minnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: call $push5=, fminf, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -13598,16 +11830,14 @@ define <4 x float> @minnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: minnum_nsz_intrinsic_v4f32:
; NO-SIMD128: .functype minnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: call $push0=, fminf, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: call $push1=, fminf, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: call $push2=, fminf, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: call $push5=, fminf, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: call $push0=, fminf, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: call $push1=, fminf, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: call $push2=, fminf, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: call $push3=, fminf, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: minnum_nsz_intrinsic_v4f32:
@@ -13619,10 +11849,8 @@ define <4 x float> @minnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: call $push5=, fminf, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call nnan nsz <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -13647,19 +11875,17 @@ define <4 x float> @fminnumv432_non_zero_intrinsic(<4 x float> %x) {
; NO-SIMD128: .functype fminnumv432_non_zero_intrinsic (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0
-; NO-SIMD128-NEXT: call $push1=, fminf, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-NEXT: call $push2=, fminf, $2, $pop9
-; NO-SIMD128-NEXT: f32.store 4($0), $pop2
-; NO-SIMD128-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-NEXT: call $push3=, fminf, $1, $pop8
-; NO-SIMD128-NEXT: f32.store 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT: call $push1=, fminf, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0
-; NO-SIMD128-NEXT: call $push6=, fminf, $4, $pop7
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop6
+; NO-SIMD128-NEXT: call $push2=, fminf, $3, $pop7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop2
+; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-NEXT: call $push3=, fminf, $2, $pop6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop3
+; NO-SIMD128-NEXT: f32.const $push5=, -0x1p0
+; NO-SIMD128-NEXT: call $push4=, fminf, $1, $pop5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop4
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: fminnumv432_non_zero_intrinsic:
@@ -13668,17 +11894,15 @@ define <4 x float> @fminnumv432_non_zero_intrinsic(<4 x float> %x) {
; NO-SIMD128-FAST-NEXT: f32.const $push0=, -0x1p0
; NO-SIMD128-FAST-NEXT: call $push1=, fminf, $1, $pop0
; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $2, $pop9
+; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $2, $pop7
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $3, $pop8
+; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $3, $pop6
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push6=, fminf, $4, $pop7
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT: f32.const $push5=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push4=, fminf, $4, $pop5
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop4
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float><float -1.0, float -1.0, float -1.0, float -1.0>)
ret <4 x float> %a
@@ -13755,19 +11979,17 @@ define <4 x float> @fminnumv432_one_zero_intrinsic(<4 x float> %x) {
; NO-SIMD128: .functype fminnumv432_one_zero_intrinsic (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0
-; NO-SIMD128-NEXT: call $push1=, fminf, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.const $push2=, 0x0p0
-; NO-SIMD128-NEXT: call $push3=, fminf, $2, $pop2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-NEXT: call $push4=, fminf, $1, $pop9
-; NO-SIMD128-NEXT: f32.store 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-NEXT: call $push7=, fminf, $4, $pop8
-; NO-SIMD128-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-NEXT: call $push1=, fminf, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
+; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-NEXT: call $push2=, fminf, $3, $pop7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop2
+; NO-SIMD128-NEXT: f32.const $push3=, 0x0p0
+; NO-SIMD128-NEXT: call $push4=, fminf, $2, $pop3
+; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-NEXT: call $push5=, fminf, $1, $pop6
+; NO-SIMD128-NEXT: f32.store 0($0), $pop5
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: fminnumv432_one_zero_intrinsic:
@@ -13779,14 +12001,12 @@ define <4 x float> @fminnumv432_one_zero_intrinsic(<4 x float> %x) {
; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x0p0
; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $2, $pop2
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push4=, fminf, $3, $pop9
+; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push4=, fminf, $3, $pop7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push5=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-FAST-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push7=, fminf, $4, $pop8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push5=, fminf, $4, $pop6
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float><float -1.0, float 0.0, float -1.0, float -1.0>)
ret <4 x float> %a
@@ -13809,16 +12029,14 @@ define <4 x float> @max_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: max_intrinsic_v4f32:
; NO-SIMD128: .functype max_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.max $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.max $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.max $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.max $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.max $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.max $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.max $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.max $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_intrinsic_v4f32:
@@ -13830,10 +12048,8 @@ define <4 x float> @max_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.max $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.max $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.max $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -13910,16 +12126,14 @@ define <4 x float> @maxnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: maxnum_intrinsic_v4f32:
; NO-SIMD128: .functype maxnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: call $push0=, fmaxf, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: call $push1=, fmaxf, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: call $push2=, fmaxf, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: call $push5=, fmaxf, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: call $push0=, fmaxf, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: call $push2=, fmaxf, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: call $push3=, fmaxf, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: maxnum_intrinsic_v4f32:
@@ -13931,10 +12145,8 @@ define <4 x float> @maxnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: call $push2=, fmaxf, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -13956,16 +12168,14 @@ define <4 x float> @maxnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: maxnum_nsz_intrinsic_v4f32:
; NO-SIMD128: .functype maxnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: call $push0=, fmaxf, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: call $push1=, fmaxf, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: call $push2=, fmaxf, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: call $push5=, fmaxf, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: call $push0=, fmaxf, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: call $push2=, fmaxf, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: call $push3=, fmaxf, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: maxnum_nsz_intrinsic_v4f32:
@@ -13977,10 +12187,8 @@ define <4 x float> @maxnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: call $push2=, fmaxf, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call nnan nsz <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -14057,19 +12265,17 @@ define <4 x float> @maxnum_one_zero_intrinsic_v4f32(<4 x float> %x, <4 x float>
; NO-SIMD128: .functype maxnum_one_zero_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0
-; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.const $push2=, 0x0p0
-; NO-SIMD128-NEXT: call $push3=, fmaxf, $2, $pop2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-NEXT: call $push4=, fmaxf, $1, $pop9
-; NO-SIMD128-NEXT: f32.store 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-NEXT: call $push7=, fmaxf, $4, $pop8
-; NO-SIMD128-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-NEXT: call $push1=, fmaxf, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
+; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-NEXT: call $push2=, fmaxf, $3, $pop7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop2
+; NO-SIMD128-NEXT: f32.const $push3=, 0x0p0
+; NO-SIMD128-NEXT: call $push4=, fmaxf, $2, $pop3
+; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-NEXT: call $push5=, fmaxf, $1, $pop6
+; NO-SIMD128-NEXT: f32.store 0($0), $pop5
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: maxnum_one_zero_intrinsic_v4f32:
@@ -14081,14 +12287,12 @@ define <4 x float> @maxnum_one_zero_intrinsic_v4f32(<4 x float> %x, <4 x float>
; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x0p0
; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $2, $pop2
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop9
+; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push5=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-FAST-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push7=, fmaxf, $4, $pop8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $pop6
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float><float -1.0, float 0.0, float -1.0, float -1.0>)
ret <4 x float> %a
@@ -14113,19 +12317,17 @@ define <4 x float> @maxnum_non_zero_intrinsic_v4f32(<4 x float> %x, <4 x float>
; NO-SIMD128: .functype maxnum_non_zero_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0
-; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.const $push2=, 0x1p0
-; NO-SIMD128-NEXT: call $push3=, fmaxf, $2, $pop2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-NEXT: call $push4=, fmaxf, $1, $pop9
-; NO-SIMD128-NEXT: f32.store 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-NEXT: call $push7=, fmaxf, $4, $pop8
-; NO-SIMD128-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-NEXT: call $push1=, fmaxf, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
+; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-NEXT: call $push2=, fmaxf, $3, $pop7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop2
+; NO-SIMD128-NEXT: f32.const $push3=, 0x1p0
+; NO-SIMD128-NEXT: call $push4=, fmaxf, $2, $pop3
+; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-NEXT: call $push5=, fmaxf, $1, $pop6
+; NO-SIMD128-NEXT: f32.store 0($0), $pop5
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: maxnum_non_zero_intrinsic_v4f32:
@@ -14137,14 +12339,12 @@ define <4 x float> @maxnum_non_zero_intrinsic_v4f32(<4 x float> %x, <4 x float>
; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x1p0
; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $2, $pop2
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop9
+; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push5=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-FAST-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push7=, fmaxf, $4, $pop8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $pop6
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float><float -1.0, float 1.0, float -1.0, float -1.0>)
ret <4 x float> %a
@@ -14240,20 +12440,18 @@ define <4 x float> @pmin_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: pmin_v4f32:
; NO-SIMD128: .functype pmin_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.lt $push0=, $7, $3
-; NO-SIMD128-NEXT: f32.select $push1=, $7, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.lt $push2=, $6, $2
-; NO-SIMD128-NEXT: f32.select $push3=, $6, $2, $pop2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-NEXT: f32.lt $push4=, $5, $1
-; NO-SIMD128-NEXT: f32.select $push5=, $5, $1, $pop4
-; NO-SIMD128-NEXT: f32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: f32.lt $push6=, $8, $4
-; NO-SIMD128-NEXT: f32.select $push7=, $8, $4, $pop6
-; NO-SIMD128-NEXT: f32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: f32.lt $push0=, $8, $4
+; NO-SIMD128-NEXT: f32.select $push1=, $8, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
+; NO-SIMD128-NEXT: f32.lt $push2=, $7, $3
+; NO-SIMD128-NEXT: f32.select $push3=, $7, $3, $pop2
+; NO-SIMD128-NEXT: f32.store 8($0), $pop3
+; NO-SIMD128-NEXT: f32.lt $push4=, $6, $2
+; NO-SIMD128-NEXT: f32.select $push5=, $6, $2, $pop4
+; NO-SIMD128-NEXT: f32.store 4($0), $pop5
+; NO-SIMD128-NEXT: f32.lt $push6=, $5, $1
+; NO-SIMD128-NEXT: f32.select $push7=, $5, $1, $pop6
+; NO-SIMD128-NEXT: f32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: pmin_v4f32:
@@ -14268,11 +12466,9 @@ define <4 x float> @pmin_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.lt $push4=, $7, $3
; NO-SIMD128-FAST-NEXT: f32.select $push5=, $7, $3, $pop4
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: f32.lt $push6=, $8, $4
; NO-SIMD128-FAST-NEXT: f32.select $push7=, $8, $4, $pop6
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = fcmp olt <4 x float> %y, %x
%a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
@@ -14295,28 +12491,26 @@ define <4 x i32> @pmin_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: pmin_int_v4f32:
; NO-SIMD128: .functype pmin_int_v4f32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: f32.reinterpret_i32 $push1=, $8
; NO-SIMD128-NEXT: f32.reinterpret_i32 $push0=, $4
; NO-SIMD128-NEXT: f32.lt $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $8, $4, $pop2
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push7=, $7
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push6=, $3
-; NO-SIMD128-NEXT: f32.lt $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $7, $3, $pop8
-; NO-SIMD128-NEXT: i32.store 8($0), $pop9
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push11=, $6
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push10=, $2
-; NO-SIMD128-NEXT: f32.lt $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.select $push13=, $6, $2, $pop12
-; NO-SIMD128-NEXT: i32.store 4($0), $pop13
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push15=, $5
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push14=, $1
-; NO-SIMD128-NEXT: f32.lt $push16=, $pop15, $pop14
-; NO-SIMD128-NEXT: i32.select $push17=, $5, $1, $pop16
-; NO-SIMD128-NEXT: i32.store 0($0), $pop17
+; NO-SIMD128-NEXT: i32.store 12($0), $pop3
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push5=, $7
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push4=, $3
+; NO-SIMD128-NEXT: f32.lt $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $7, $3, $pop6
+; NO-SIMD128-NEXT: i32.store 8($0), $pop7
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push9=, $6
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push8=, $2
+; NO-SIMD128-NEXT: f32.lt $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $6, $2, $pop10
+; NO-SIMD128-NEXT: i32.store 4($0), $pop11
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push13=, $5
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push12=, $1
+; NO-SIMD128-NEXT: f32.lt $push14=, $pop13, $pop12
+; NO-SIMD128-NEXT: i32.select $push15=, $5, $1, $pop14
+; NO-SIMD128-NEXT: i32.store 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: pmin_int_v4f32:
@@ -14337,13 +12531,11 @@ define <4 x i32> @pmin_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: f32.lt $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $7, $3, $pop10
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: f32.reinterpret_i32 $push13=, $8
; NO-SIMD128-FAST-NEXT: f32.reinterpret_i32 $push12=, $4
; NO-SIMD128-FAST-NEXT: f32.lt $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $8, $4, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop17), $pop15
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%fx = bitcast <4 x i32> %x to <4 x float>
%fy = bitcast <4 x i32> %y to <4 x float>
@@ -14368,20 +12560,18 @@ define <4 x float> @pmax_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: pmax_v4f32:
; NO-SIMD128: .functype pmax_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.lt $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.select $push1=, $7, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.lt $push2=, $2, $6
-; NO-SIMD128-NEXT: f32.select $push3=, $6, $2, $pop2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-NEXT: f32.lt $push4=, $1, $5
-; NO-SIMD128-NEXT: f32.select $push5=, $5, $1, $pop4
-; NO-SIMD128-NEXT: f32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: f32.lt $push6=, $4, $8
-; NO-SIMD128-NEXT: f32.select $push7=, $8, $4, $pop6
-; NO-SIMD128-NEXT: f32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: f32.lt $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.select $push1=, $8, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
+; NO-SIMD128-NEXT: f32.lt $push2=, $3, $7
+; NO-SIMD128-NEXT: f32.select $push3=, $7, $3, $pop2
+; NO-SIMD128-NEXT: f32.store 8($0), $pop3
+; NO-SIMD128-NEXT: f32.lt $push4=, $2, $6
+; NO-SIMD128-NEXT: f32.select $push5=, $6, $2, $pop4
+; NO-SIMD128-NEXT: f32.store 4($0), $pop5
+; NO-SIMD128-NEXT: f32.lt $push6=, $1, $5
+; NO-SIMD128-NEXT: f32.select $push7=, $5, $1, $pop6
+; NO-SIMD128-NEXT: f32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: pmax_v4f32:
@@ -14396,11 +12586,9 @@ define <4 x float> @pmax_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.lt $push4=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.select $push5=, $7, $3, $pop4
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: f32.lt $push6=, $4, $8
; NO-SIMD128-FAST-NEXT: f32.select $push7=, $8, $4, $pop6
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = fcmp olt <4 x float> %x, %y
%a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
@@ -14423,28 +12611,26 @@ define <4 x i32> @pmax_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: pmax_int_v4f32:
; NO-SIMD128: .functype pmax_int_v4f32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: f32.reinterpret_i32 $push1=, $4
; NO-SIMD128-NEXT: f32.reinterpret_i32 $push0=, $8
; NO-SIMD128-NEXT: f32.lt $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $8, $4, $pop2
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push7=, $3
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push6=, $7
-; NO-SIMD128-NEXT: f32.lt $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $7, $3, $pop8
-; NO-SIMD128-NEXT: i32.store 8($0), $pop9
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push11=, $2
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push10=, $6
-; NO-SIMD128-NEXT: f32.lt $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.select $push13=, $6, $2, $pop12
-; NO-SIMD128-NEXT: i32.store 4($0), $pop13
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push15=, $1
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push14=, $5
-; NO-SIMD128-NEXT: f32.lt $push16=, $pop15, $pop14
-; NO-SIMD128-NEXT: i32.select $push17=, $5, $1, $pop16
-; NO-SIMD128-NEXT: i32.store 0($0), $pop17
+; NO-SIMD128-NEXT: i32.store 12($0), $pop3
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push5=, $3
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push4=, $7
+; NO-SIMD128-NEXT: f32.lt $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $7, $3, $pop6
+; NO-SIMD128-NEXT: i32.store 8($0), $pop7
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push9=, $2
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push8=, $6
+; NO-SIMD128-NEXT: f32.lt $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $6, $2, $pop10
+; NO-SIMD128-NEXT: i32.store 4($0), $pop11
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push13=, $1
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push12=, $5
+; NO-SIMD128-NEXT: f32.lt $push14=, $pop13, $pop12
+; NO-SIMD128-NEXT: i32.select $push15=, $5, $1, $pop14
+; NO-SIMD128-NEXT: i32.store 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: pmax_int_v4f32:
@@ -14465,13 +12651,11 @@ define <4 x i32> @pmax_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: f32.lt $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $7, $3, $pop10
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: f32.reinterpret_i32 $push13=, $4
; NO-SIMD128-FAST-NEXT: f32.reinterpret_i32 $push12=, $8
; NO-SIMD128-FAST-NEXT: f32.lt $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $8, $4, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop17), $pop15
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%fx = bitcast <4 x i32> %x to <4 x float>
%fy = bitcast <4 x i32> %y to <4 x float>
@@ -14496,16 +12680,14 @@ define <4 x float> @add_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: add_v4f32:
; NO-SIMD128: .functype add_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.add $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.add $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.add $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.add $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.add $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.add $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.add $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.add $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: add_v4f32:
@@ -14517,10 +12699,8 @@ define <4 x float> @add_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.add $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.add $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.add $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = fadd <4 x float> %x, %y
ret <4 x float> %a
@@ -14542,16 +12722,14 @@ define <4 x float> @sub_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: sub_v4f32:
; NO-SIMD128: .functype sub_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.sub $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.sub $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.sub $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.sub $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.sub $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.sub $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.sub $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.sub $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: sub_v4f32:
@@ -14563,10 +12741,8 @@ define <4 x float> @sub_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.sub $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.sub $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.sub $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = fsub <4 x float> %x, %y
ret <4 x float> %a
@@ -14588,16 +12764,14 @@ define <4 x float> @div_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: div_v4f32:
; NO-SIMD128: .functype div_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.div $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.div $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.div $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.div $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.div $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.div $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.div $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.div $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: div_v4f32:
@@ -14609,10 +12783,8 @@ define <4 x float> @div_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.div $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.div $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.div $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = fdiv <4 x float> %x, %y
ret <4 x float> %a
@@ -14634,16 +12806,14 @@ define <4 x float> @mul_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: mul_v4f32:
; NO-SIMD128: .functype mul_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.mul $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.mul $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.mul $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.mul $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.mul $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.mul $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.mul $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.mul $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: mul_v4f32:
@@ -14655,10 +12825,8 @@ define <4 x float> @mul_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.mul $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.mul $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.mul $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = fmul <4 x float> %x, %y
ret <4 x float> %a
@@ -14681,16 +12849,14 @@ define <4 x float> @sqrt_v4f32(<4 x float> %x) {
; NO-SIMD128-LABEL: sqrt_v4f32:
; NO-SIMD128: .functype sqrt_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.sqrt $push0=, $3
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.sqrt $push1=, $2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.sqrt $push2=, $1
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: f32.sqrt $push5=, $4
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: f32.sqrt $push0=, $4
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.sqrt $push1=, $3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.sqrt $push2=, $2
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.sqrt $push3=, $1
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: sqrt_v4f32:
@@ -14702,10 +12868,8 @@ define <4 x float> @sqrt_v4f32(<4 x float> %x) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.sqrt $push2=, $3
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.sqrt $push5=, $4
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.sqrt $push3=, $4
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
ret <4 x float> %a
diff --git a/llvm/test/CodeGen/WebAssembly/simd.ll b/llvm/test/CodeGen/WebAssembly/simd.ll
index d2a38de4cc85..5ec9f6a2a321 100644
--- a/llvm/test/CodeGen/WebAssembly/simd.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd.ll
@@ -38,44 +38,22 @@ define <16 x i8> @splat_v16i8(i8 %x) {
; NO-SIMD128-LABEL: splat_v16i8:
; NO-SIMD128: .functype splat_v16i8 (i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $1
+; NO-SIMD128-NEXT: i32.store8 14($0), $1
+; NO-SIMD128-NEXT: i32.store8 13($0), $1
+; NO-SIMD128-NEXT: i32.store8 12($0), $1
+; NO-SIMD128-NEXT: i32.store8 11($0), $1
+; NO-SIMD128-NEXT: i32.store8 10($0), $1
+; NO-SIMD128-NEXT: i32.store8 9($0), $1
; NO-SIMD128-NEXT: i32.store8 8($0), $1
+; NO-SIMD128-NEXT: i32.store8 7($0), $1
+; NO-SIMD128-NEXT: i32.store8 6($0), $1
+; NO-SIMD128-NEXT: i32.store8 5($0), $1
; NO-SIMD128-NEXT: i32.store8 4($0), $1
+; NO-SIMD128-NEXT: i32.store8 3($0), $1
; NO-SIMD128-NEXT: i32.store8 2($0), $1
; NO-SIMD128-NEXT: i32.store8 1($0), $1
; NO-SIMD128-NEXT: i32.store8 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $1
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $1
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $1
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $1
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $1
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $1
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $1
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $1
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $1
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $1
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $1
; NO-SIMD128-NEXT: return
%v = insertelement <16 x i8> undef, i8 %x, i32 0
%res = shufflevector <16 x i8> %v, <16 x i8> undef,
@@ -356,44 +334,22 @@ define <16 x i8> @replace_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-LABEL: replace_v16i8:
; NO-SIMD128: .functype replace_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $16
+; NO-SIMD128-NEXT: i32.store8 14($0), $15
+; NO-SIMD128-NEXT: i32.store8 13($0), $14
+; NO-SIMD128-NEXT: i32.store8 12($0), $13
+; NO-SIMD128-NEXT: i32.store8 11($0), $17
+; NO-SIMD128-NEXT: i32.store8 10($0), $11
+; NO-SIMD128-NEXT: i32.store8 9($0), $10
; NO-SIMD128-NEXT: i32.store8 8($0), $9
+; NO-SIMD128-NEXT: i32.store8 7($0), $8
+; NO-SIMD128-NEXT: i32.store8 6($0), $7
+; NO-SIMD128-NEXT: i32.store8 5($0), $6
; NO-SIMD128-NEXT: i32.store8 4($0), $5
+; NO-SIMD128-NEXT: i32.store8 3($0), $4
; NO-SIMD128-NEXT: i32.store8 2($0), $3
; NO-SIMD128-NEXT: i32.store8 1($0), $2
; NO-SIMD128-NEXT: i32.store8 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $16
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $15
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $14
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $13
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $17
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $11
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $10
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $8
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $7
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $6
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $4
; NO-SIMD128-NEXT: return
%res = insertelement <16 x i8> %v, i8 %x, i32 11
ret <16 x i8> %res
@@ -461,44 +417,22 @@ define <16 x i8> @replace_zero_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-LABEL: replace_zero_v16i8:
; NO-SIMD128: .functype replace_zero_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $16
+; NO-SIMD128-NEXT: i32.store8 14($0), $15
+; NO-SIMD128-NEXT: i32.store8 13($0), $14
+; NO-SIMD128-NEXT: i32.store8 12($0), $13
+; NO-SIMD128-NEXT: i32.store8 11($0), $12
+; NO-SIMD128-NEXT: i32.store8 10($0), $11
+; NO-SIMD128-NEXT: i32.store8 9($0), $10
; NO-SIMD128-NEXT: i32.store8 8($0), $9
+; NO-SIMD128-NEXT: i32.store8 7($0), $8
+; NO-SIMD128-NEXT: i32.store8 6($0), $7
+; NO-SIMD128-NEXT: i32.store8 5($0), $6
; NO-SIMD128-NEXT: i32.store8 4($0), $5
+; NO-SIMD128-NEXT: i32.store8 3($0), $4
; NO-SIMD128-NEXT: i32.store8 2($0), $3
; NO-SIMD128-NEXT: i32.store8 1($0), $2
; NO-SIMD128-NEXT: i32.store8 0($0), $17
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $16
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $15
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $14
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $13
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $12
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $11
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $10
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $8
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $7
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $6
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $4
; NO-SIMD128-NEXT: return
%res = insertelement <16 x i8> %v, i8 %x, i32 0
ret <16 x i8> %res
@@ -514,44 +448,22 @@ define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: shuffle_v16i8:
; NO-SIMD128: .functype shuffle_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $32
+; NO-SIMD128-NEXT: i32.store8 14($0), $15
+; NO-SIMD128-NEXT: i32.store8 13($0), $30
+; NO-SIMD128-NEXT: i32.store8 12($0), $13
+; NO-SIMD128-NEXT: i32.store8 11($0), $28
+; NO-SIMD128-NEXT: i32.store8 10($0), $11
+; NO-SIMD128-NEXT: i32.store8 9($0), $26
; NO-SIMD128-NEXT: i32.store8 8($0), $9
+; NO-SIMD128-NEXT: i32.store8 7($0), $24
+; NO-SIMD128-NEXT: i32.store8 6($0), $7
+; NO-SIMD128-NEXT: i32.store8 5($0), $22
; NO-SIMD128-NEXT: i32.store8 4($0), $5
+; NO-SIMD128-NEXT: i32.store8 3($0), $20
; NO-SIMD128-NEXT: i32.store8 2($0), $3
; NO-SIMD128-NEXT: i32.store8 1($0), $18
; NO-SIMD128-NEXT: i32.store8 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $32
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $15
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $30
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $13
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $28
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $11
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $26
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $24
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $7
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $22
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $20
; NO-SIMD128-NEXT: return
%res = shufflevector <16 x i8> %x, <16 x i8> %y,
<16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23,
@@ -569,44 +481,22 @@ define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v16i8:
; NO-SIMD128: .functype shuffle_undef_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $2
+; NO-SIMD128-NEXT: i32.store8 14($0), $2
+; NO-SIMD128-NEXT: i32.store8 13($0), $2
+; NO-SIMD128-NEXT: i32.store8 12($0), $2
+; NO-SIMD128-NEXT: i32.store8 11($0), $2
+; NO-SIMD128-NEXT: i32.store8 10($0), $2
+; NO-SIMD128-NEXT: i32.store8 9($0), $2
; NO-SIMD128-NEXT: i32.store8 8($0), $2
+; NO-SIMD128-NEXT: i32.store8 7($0), $2
+; NO-SIMD128-NEXT: i32.store8 6($0), $2
+; NO-SIMD128-NEXT: i32.store8 5($0), $2
; NO-SIMD128-NEXT: i32.store8 4($0), $2
+; NO-SIMD128-NEXT: i32.store8 3($0), $2
; NO-SIMD128-NEXT: i32.store8 2($0), $2
; NO-SIMD128-NEXT: i32.store8 1($0), $2
; NO-SIMD128-NEXT: i32.store8 0($0), $2
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $2
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $2
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $2
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $2
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $2
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $2
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $2
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $2
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $2
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $2
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <16 x i8> %x, <16 x i8> %y,
<16 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
@@ -641,44 +531,22 @@ define <16 x i8> @build_v16i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3,
; NO-SIMD128-LABEL: build_v16i8:
; NO-SIMD128: .functype build_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $16
+; NO-SIMD128-NEXT: i32.store8 14($0), $15
+; NO-SIMD128-NEXT: i32.store8 13($0), $14
+; NO-SIMD128-NEXT: i32.store8 12($0), $13
+; NO-SIMD128-NEXT: i32.store8 11($0), $12
+; NO-SIMD128-NEXT: i32.store8 10($0), $11
+; NO-SIMD128-NEXT: i32.store8 9($0), $10
; NO-SIMD128-NEXT: i32.store8 8($0), $9
+; NO-SIMD128-NEXT: i32.store8 7($0), $8
+; NO-SIMD128-NEXT: i32.store8 6($0), $7
+; NO-SIMD128-NEXT: i32.store8 5($0), $6
; NO-SIMD128-NEXT: i32.store8 4($0), $5
+; NO-SIMD128-NEXT: i32.store8 3($0), $4
; NO-SIMD128-NEXT: i32.store8 2($0), $3
; NO-SIMD128-NEXT: i32.store8 1($0), $2
; NO-SIMD128-NEXT: i32.store8 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $16
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $15
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $14
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $13
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $12
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $11
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $10
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $8
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $7
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $6
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $4
; NO-SIMD128-NEXT: return
i8 %x4, i8 %x5, i8 %x6, i8 %x7,
i8 %x8, i8 %x9, i8 %x10, i8 %x11,
@@ -734,22 +602,14 @@ define <8 x i16> @splat_v8i16(i16 %x) {
; NO-SIMD128-LABEL: splat_v8i16:
; NO-SIMD128: .functype splat_v8i16 (i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $1
+; NO-SIMD128-NEXT: i32.store16 12($0), $1
+; NO-SIMD128-NEXT: i32.store16 10($0), $1
; NO-SIMD128-NEXT: i32.store16 8($0), $1
+; NO-SIMD128-NEXT: i32.store16 6($0), $1
; NO-SIMD128-NEXT: i32.store16 4($0), $1
; NO-SIMD128-NEXT: i32.store16 2($0), $1
; NO-SIMD128-NEXT: i32.store16 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $1
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $1
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $1
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $1
; NO-SIMD128-NEXT: return
%v = insertelement <8 x i16> undef, i16 %x, i32 0
%res = shufflevector <8 x i16> %v, <8 x i16> undef,
@@ -1016,22 +876,14 @@ define <8 x i16> @replace_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-LABEL: replace_v8i16:
; NO-SIMD128: .functype replace_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $9
+; NO-SIMD128-NEXT: i32.store16 12($0), $7
+; NO-SIMD128-NEXT: i32.store16 10($0), $6
; NO-SIMD128-NEXT: i32.store16 8($0), $5
+; NO-SIMD128-NEXT: i32.store16 6($0), $4
; NO-SIMD128-NEXT: i32.store16 4($0), $3
; NO-SIMD128-NEXT: i32.store16 2($0), $2
; NO-SIMD128-NEXT: i32.store16 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $9
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $7
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $6
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $4
; NO-SIMD128-NEXT: return
%res = insertelement <8 x i16> %v, i16 %x, i32 7
ret <8 x i16> %res
@@ -1095,22 +947,14 @@ define <8 x i16> @replace_zero_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-LABEL: replace_zero_v8i16:
; NO-SIMD128: .functype replace_zero_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $8
+; NO-SIMD128-NEXT: i32.store16 12($0), $7
+; NO-SIMD128-NEXT: i32.store16 10($0), $6
; NO-SIMD128-NEXT: i32.store16 8($0), $5
+; NO-SIMD128-NEXT: i32.store16 6($0), $4
; NO-SIMD128-NEXT: i32.store16 4($0), $3
; NO-SIMD128-NEXT: i32.store16 2($0), $2
; NO-SIMD128-NEXT: i32.store16 0($0), $9
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $8
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $7
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $6
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $4
; NO-SIMD128-NEXT: return
%res = insertelement <8 x i16> %v, i16 %x, i32 0
ret <8 x i16> %res
@@ -1126,22 +970,14 @@ define <8 x i16> @shuffle_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: shuffle_v8i16:
; NO-SIMD128: .functype shuffle_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $16
+; NO-SIMD128-NEXT: i32.store16 12($0), $7
+; NO-SIMD128-NEXT: i32.store16 10($0), $14
; NO-SIMD128-NEXT: i32.store16 8($0), $5
+; NO-SIMD128-NEXT: i32.store16 6($0), $12
; NO-SIMD128-NEXT: i32.store16 4($0), $3
; NO-SIMD128-NEXT: i32.store16 2($0), $10
; NO-SIMD128-NEXT: i32.store16 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $16
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $7
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $14
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $12
; NO-SIMD128-NEXT: return
%res = shufflevector <8 x i16> %x, <8 x i16> %y,
<8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
@@ -1158,22 +994,14 @@ define <8 x i16> @shuffle_undef_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v8i16:
; NO-SIMD128: .functype shuffle_undef_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $2
+; NO-SIMD128-NEXT: i32.store16 12($0), $2
+; NO-SIMD128-NEXT: i32.store16 10($0), $2
; NO-SIMD128-NEXT: i32.store16 8($0), $2
+; NO-SIMD128-NEXT: i32.store16 6($0), $2
; NO-SIMD128-NEXT: i32.store16 4($0), $2
; NO-SIMD128-NEXT: i32.store16 2($0), $2
; NO-SIMD128-NEXT: i32.store16 0($0), $2
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $2
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $2
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $2
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <8 x i16> %x, <8 x i16> %y,
<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
@@ -1198,22 +1026,14 @@ define <8 x i16> @build_v8i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3,
; NO-SIMD128-LABEL: build_v8i16:
; NO-SIMD128: .functype build_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $8
+; NO-SIMD128-NEXT: i32.store16 12($0), $7
+; NO-SIMD128-NEXT: i32.store16 10($0), $6
; NO-SIMD128-NEXT: i32.store16 8($0), $5
+; NO-SIMD128-NEXT: i32.store16 6($0), $4
; NO-SIMD128-NEXT: i32.store16 4($0), $3
; NO-SIMD128-NEXT: i32.store16 2($0), $2
; NO-SIMD128-NEXT: i32.store16 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $8
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $7
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $6
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $4
; NO-SIMD128-NEXT: return
i16 %x4, i16 %x5, i16 %x6, i16 %x7) {
%t0 = insertelement <8 x i16> undef, i16 %x0, i32 0
@@ -1258,12 +1078,10 @@ define <4 x i32> @splat_v4i32(i32 %x) {
; NO-SIMD128-LABEL: splat_v4i32:
; NO-SIMD128: .functype splat_v4i32 (i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $1
; NO-SIMD128-NEXT: i32.store 8($0), $1
; NO-SIMD128-NEXT: i32.store 4($0), $1
; NO-SIMD128-NEXT: i32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $1
; NO-SIMD128-NEXT: return
%v = insertelement <4 x i32> undef, i32 %x, i32 0
%res = shufflevector <4 x i32> %v, <4 x i32> undef,
@@ -1368,12 +1186,10 @@ define <4 x i32> @replace_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-LABEL: replace_v4i32:
; NO-SIMD128: .functype replace_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $4
; NO-SIMD128-NEXT: i32.store 8($0), $5
; NO-SIMD128-NEXT: i32.store 4($0), $2
; NO-SIMD128-NEXT: i32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%res = insertelement <4 x i32> %v, i32 %x, i32 2
ret <4 x i32> %res
@@ -1433,12 +1249,10 @@ define <4 x i32> @replace_zero_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-LABEL: replace_zero_v4i32:
; NO-SIMD128: .functype replace_zero_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $4
; NO-SIMD128-NEXT: i32.store 8($0), $3
; NO-SIMD128-NEXT: i32.store 4($0), $2
; NO-SIMD128-NEXT: i32.store 0($0), $5
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%res = insertelement <4 x i32> %v, i32 %x, i32 0
ret <4 x i32> %res
@@ -1454,12 +1268,10 @@ define <4 x i32> @shuffle_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: shuffle_v4i32:
; NO-SIMD128: .functype shuffle_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $8
; NO-SIMD128-NEXT: i32.store 8($0), $3
; NO-SIMD128-NEXT: i32.store 4($0), $6
; NO-SIMD128-NEXT: i32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $8
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x i32> %x, <4 x i32> %y,
<4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -1476,12 +1288,10 @@ define <4 x i32> @shuffle_undef_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v4i32:
; NO-SIMD128: .functype shuffle_undef_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $2
; NO-SIMD128-NEXT: i32.store 8($0), $2
; NO-SIMD128-NEXT: i32.store 4($0), $2
; NO-SIMD128-NEXT: i32.store 0($0), $2
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x i32> %x, <4 x i32> %y,
<4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -1501,12 +1311,10 @@ define <4 x i32> @build_v4i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
; NO-SIMD128-LABEL: build_v4i32:
; NO-SIMD128: .functype build_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $4
; NO-SIMD128-NEXT: i32.store 8($0), $3
; NO-SIMD128-NEXT: i32.store 4($0), $2
; NO-SIMD128-NEXT: i32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%t0 = insertelement <4 x i32> undef, i32 %x0, i32 0
%t1 = insertelement <4 x i32> %t0, i32 %x1, i32 1
@@ -1801,12 +1609,10 @@ define <4 x float> @splat_v4f32(float %x) {
; NO-SIMD128-LABEL: splat_v4f32:
; NO-SIMD128: .functype splat_v4f32 (i32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $1
; NO-SIMD128-NEXT: f32.store 8($0), $1
; NO-SIMD128-NEXT: f32.store 4($0), $1
; NO-SIMD128-NEXT: f32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $1
; NO-SIMD128-NEXT: return
%v = insertelement <4 x float> undef, float %x, i32 0
%res = shufflevector <4 x float> %v, <4 x float> undef,
@@ -1911,12 +1717,10 @@ define <4 x float> @replace_v4f32(<4 x float> %v, float %x) {
; NO-SIMD128-LABEL: replace_v4f32:
; NO-SIMD128: .functype replace_v4f32 (i32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $4
; NO-SIMD128-NEXT: f32.store 8($0), $5
; NO-SIMD128-NEXT: f32.store 4($0), $2
; NO-SIMD128-NEXT: f32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%res = insertelement <4 x float> %v, float %x, i32 2
ret <4 x float> %res
@@ -1976,12 +1780,10 @@ define <4 x float> @replace_zero_v4f32(<4 x float> %v, float %x) {
; NO-SIMD128-LABEL: replace_zero_v4f32:
; NO-SIMD128: .functype replace_zero_v4f32 (i32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $4
; NO-SIMD128-NEXT: f32.store 8($0), $3
; NO-SIMD128-NEXT: f32.store 4($0), $2
; NO-SIMD128-NEXT: f32.store 0($0), $5
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%res = insertelement <4 x float> %v, float %x, i32 0
ret <4 x float> %res
@@ -1997,12 +1799,10 @@ define <4 x float> @shuffle_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: shuffle_v4f32:
; NO-SIMD128: .functype shuffle_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $8
; NO-SIMD128-NEXT: f32.store 8($0), $3
; NO-SIMD128-NEXT: f32.store 4($0), $6
; NO-SIMD128-NEXT: f32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $8
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x float> %x, <4 x float> %y,
<4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -2019,12 +1819,10 @@ define <4 x float> @shuffle_undef_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v4f32:
; NO-SIMD128: .functype shuffle_undef_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $2
; NO-SIMD128-NEXT: f32.store 8($0), $2
; NO-SIMD128-NEXT: f32.store 4($0), $2
; NO-SIMD128-NEXT: f32.store 0($0), $2
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x float> %x, <4 x float> %y,
<4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -2044,12 +1842,10 @@ define <4 x float> @build_v4f32(float %x0, float %x1, float %x2, float %x3) {
; NO-SIMD128-LABEL: build_v4f32:
; NO-SIMD128: .functype build_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $4
; NO-SIMD128-NEXT: f32.store 8($0), $3
; NO-SIMD128-NEXT: f32.store 4($0), $2
; NO-SIMD128-NEXT: f32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%t0 = insertelement <4 x float> undef, float %x0, i32 0
%t1 = insertelement <4 x float> %t0, float %x1, i32 1
diff --git a/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll b/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
index 609be3bb2e54..50e736ac68d2 100644
--- a/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
+++ b/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s | FileCheck %s
; Check that the shr(shl X, 56), 48) is not mistakenly turned into
@@ -16,11 +17,13 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "x86_64-unknown-linux-gnu"
define i64 @foo(i64 %b) nounwind readnone {
-entry:
; CHECK-LABEL: foo:
-; CHECK: movsbq %dil, %rax
-; CHECK: shlq $8, %rax
-; CHECK: orq $1, %rax
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movsbq %dil, %rax
+; CHECK-NEXT: shlq $8, %rax
+; CHECK-NEXT: incq %rax
+; CHECK-NEXT: retq
+entry:
%shl = shl i64 %b, 56 ; <i64> [#uses=1]
%shr = ashr i64 %shl, 48 ; <i64> [#uses=1]
%add5 = or i64 %shr, 1 ; <i64> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir
index 548cf24b9200..13c958587327 100644
--- a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir
+++ b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir
@@ -869,13 +869,13 @@ body: |
$ymm0 = VSHUFPSZ256rmi $ymm0, $rdi, 1, $noreg, 0, $noreg, -24
; CHECK: $ymm0 = VSHUFPSYrri $ymm0, $ymm1, -24
$ymm0 = VSHUFPSZ256rri $ymm0, $ymm1, -24
- ; CHECK: $ymm0 = VROUNDPDYm $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $ymm0 = VROUNDPDYmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$ymm0 = VRNDSCALEPDZ256rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $ymm0 = VROUNDPDYr $ymm0, 15, implicit $mxcsr
+ ; CHECK: $ymm0 = VROUNDPDYri $ymm0, 15, implicit $mxcsr
$ymm0 = VRNDSCALEPDZ256rri $ymm0, 15, implicit $mxcsr
- ; CHECK: $ymm0 = VROUNDPSYm $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $ymm0 = VROUNDPSYmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$ymm0 = VRNDSCALEPSZ256rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $ymm0 = VROUNDPSYr $ymm0, 15, implicit $mxcsr
+ ; CHECK: $ymm0 = VROUNDPSYri $ymm0, 15, implicit $mxcsr
$ymm0 = VRNDSCALEPSZ256rri $ymm0, 15, implicit $mxcsr
; CHECK: $ymm0 = VPERM2F128rm $ymm0, $rip, 1, $noreg, 0, $noreg, 32
$ymm0 = VSHUFF32X4Z256rmi $ymm0, $rip, 1, $noreg, 0, $noreg, 228
@@ -1751,13 +1751,13 @@ body: |
$xmm0 = VALIGNQZ128rmi $xmm0, $rip, 1, $noreg, 0, $noreg, 1
; CHECK: $xmm0 = VPALIGNRrri $xmm0, $xmm1, 8
$xmm0 = VALIGNQZ128rri $xmm0, $xmm1, 1
- ; CHECK: $xmm0 = VROUNDPDm $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDPDmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALEPDZ128rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDPDr $xmm0, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDPDri $xmm0, 15, implicit $mxcsr
$xmm0 = VRNDSCALEPDZ128rri $xmm0, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDPSm $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDPSmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALEPSZ128rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDPSr $xmm0, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDPSri $xmm0, 15, implicit $mxcsr
$xmm0 = VRNDSCALEPSZ128rri $xmm0, 15, implicit $mxcsr
RET64
@@ -2308,21 +2308,21 @@ body: |
$xmm0 = VINSERTPSZrm $xmm0, $rdi, 1, $noreg, 0, $noreg, 1
; CHECK: $xmm0 = VINSERTPSrr $xmm0, $xmm0, 1
$xmm0 = VINSERTPSZrr $xmm0, $xmm0, 1
- ; CHECK: $xmm0 = VROUNDSDm $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSDmi $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALESDZm $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSDr $xmm0, $xmm1, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSDri $xmm0, $xmm1, 15, implicit $mxcsr
$xmm0 = VRNDSCALESDZr $xmm0, $xmm1, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSSm $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSSmi $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALESSZm $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSSr $xmm0, $xmm1, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSSri $xmm0, $xmm1, 15, implicit $mxcsr
$xmm0 = VRNDSCALESSZr $xmm0, $xmm1, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSDm_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSDmi_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALESDZm_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSDr_Int $xmm0, $xmm1, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSDri_Int $xmm0, $xmm1, 15, implicit $mxcsr
$xmm0 = VRNDSCALESDZr_Int $xmm0, $xmm1, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSSm_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSSmi_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALESSZm_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSSr_Int $xmm0, $xmm1, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSSri_Int $xmm0, $xmm1, 15, implicit $mxcsr
$xmm0 = VRNDSCALESSZr_Int $xmm0, $xmm1, 15, implicit $mxcsr
RET64
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index d9ee5f0d3e49..ee7f4aea02c0 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -173,16 +173,14 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: vmovdqa (%edx), %xmm0
; X86-NEXT: vpand (%ecx), %xmm0, %xmm0
-; X86-NEXT: vpextrb $6, %xmm0, %ecx
-; X86-NEXT: movb %cl, (%eax)
+; X86-NEXT: vpextrb $6, %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: freeze_extractelement:
; X64: # %bb.0:
; X64-NEXT: vmovdqa (%rdi), %xmm0
; X64-NEXT: vpand (%rsi), %xmm0, %xmm0
-; X64-NEXT: vpextrb $6, %xmm0, %eax
-; X64-NEXT: movb %al, (%rdx)
+; X64-NEXT: vpextrb $6, %xmm0, (%rdx)
; X64-NEXT: retq
%i0 = load <16 x i8>, ptr %origin0
%i1 = load <16 x i8>, ptr %origin1
diff --git a/llvm/test/CodeGen/X86/load-local-v3i129.ll b/llvm/test/CodeGen/X86/load-local-v3i129.ll
index 8fa7ce066453..eb5d172a3b35 100644
--- a/llvm/test/CodeGen/X86/load-local-v3i129.ll
+++ b/llvm/test/CodeGen/X86/load-local-v3i129.ll
@@ -12,7 +12,7 @@ define void @_start() nounwind {
; FAST-SHLD-NEXT: shrq $2, %rcx
; FAST-SHLD-NEXT: shldq $2, %rdx, %rcx
; FAST-SHLD-NEXT: andq $-4, %rax
-; FAST-SHLD-NEXT: orq $1, %rax
+; FAST-SHLD-NEXT: incq %rax
; FAST-SHLD-NEXT: movq %rax, -40(%rsp)
; FAST-SHLD-NEXT: movq %rcx, -32(%rsp)
; FAST-SHLD-NEXT: orq $-2, -56(%rsp)
@@ -23,7 +23,7 @@ define void @_start() nounwind {
; SLOW-SHLD: # %bb.0: # %Entry
; SLOW-SHLD-NEXT: movq -40(%rsp), %rax
; SLOW-SHLD-NEXT: andq $-4, %rax
-; SLOW-SHLD-NEXT: orq $1, %rax
+; SLOW-SHLD-NEXT: incq %rax
; SLOW-SHLD-NEXT: movq %rax, -40(%rsp)
; SLOW-SHLD-NEXT: orq $-2, -56(%rsp)
; SLOW-SHLD-NEXT: movq $-1, -48(%rsp)
diff --git a/llvm/test/CodeGen/X86/pr23664.ll b/llvm/test/CodeGen/X86/pr23664.ll
index 453e5db2bed6..8179602b8c2a 100644
--- a/llvm/test/CodeGen/X86/pr23664.ll
+++ b/llvm/test/CodeGen/X86/pr23664.ll
@@ -6,7 +6,7 @@ define i2 @f(i32 %arg) {
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal (%rdi,%rdi), %eax
-; CHECK-NEXT: orb $1, %al
+; CHECK-NEXT: incb %al
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%trunc = trunc i32 %arg to i1
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index 691ca40191d4..f7a27a5b9144 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -65,6 +65,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-NO-BMI2-NEXT: movzwl %ax, %eax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -74,6 +75,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-BMI2-NEXT: movzwl %ax, %eax
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
; X64-BMI2-NEXT: movb %al, (%rdx)
@@ -81,14 +83,15 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X86-NO-BMI2: # %bb.0:
-; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NEXT: movzwl (%eax), %eax
+; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NEXT: movzwl (%edx), %edx
+; X86-NO-BMI2-NEXT: movzwl %dx, %edx
; X86-NO-BMI2-NEXT: shll $3, %ecx
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NEXT: movb %al, (%edx)
+; X86-NO-BMI2-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NEXT: movb %dl, (%eax)
; X86-NO-BMI2-NEXT: retl
;
; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
@@ -97,6 +100,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT: movzwl (%edx), %edx
+; X86-BMI2-NEXT: movzwl %dx, %edx
; X86-BMI2-NEXT: shll $3, %ecx
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT: movb %cl, (%eax)
@@ -119,6 +123,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-NO-BMI2-NEXT: movzwl %ax, %eax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -128,6 +133,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-BMI2-NEXT: movzwl %ax, %eax
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
; X64-BMI2-NEXT: movw %ax, (%rdx)
@@ -139,6 +145,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT: movzwl (%edx), %edx
+; X86-NO-BMI2-NEXT: movzwl %dx, %edx
; X86-NO-BMI2-NEXT: shll $3, %ecx
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NEXT: shrl %cl, %edx
@@ -151,6 +158,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT: movzwl (%edx), %edx
+; X86-BMI2-NEXT: movzwl %dx, %edx
; X86-BMI2-NEXT: shll $3, %ecx
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT: movw %cx, (%eax)
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out-no-ps.ll b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out-no-ps.ll
index 8d96ab021288..f75042b1a158 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out-no-ps.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out-no-ps.ll
@@ -1,7 +1,5 @@
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S \
-; RUN: -hwasan-selective-instrumentation=0 | FileCheck %s --check-prefix=FULL
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S \
-; RUN: -hwasan-selective-instrumentation=1 | FileCheck %s --check-prefix=SELSAN
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S | FileCheck %s --check-prefix=FULL
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-percentile-cutoff-hot=990000 | FileCheck %s --check-prefix=SELSAN
; FULL: @not_sanitized
; FULL-NEXT: %x = alloca i8, i64 4
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll
index 28e43a99883e..ab3f56dbee9e 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll
@@ -1,23 +1,19 @@
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-selective-instrumentation=1 \
-; RUN: | FileCheck %s --check-prefix=DEFAULT
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-selective-instrumentation=1 \
-; RUN: -hwasan-percentile-cutoff-hot=700000 | FileCheck %s --check-prefix=HOT_RATE
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-selective-instrumentation=1 \
-; RUN: -hwasan-random-skip-rate=0.0 | FileCheck %s --check-prefix=RANDOM_RATE_0
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-selective-instrumentation=1 \
-; RUN: -hwasan-random-skip-rate=1.0 | FileCheck %s --check-prefix=RANDOM_RATE_1
-
-; DEFAULT: @sanitized
-; DEFAULT-NEXT: %x = alloca i8, i64 4
-
-; HOT_RATE: @sanitized
-; HOT_RATE-NEXT: @__hwasan_tls
-
-; RANDOM_RATE_0: @sanitized
-; RANDOM_RATE_0-NEXT: @__hwasan_tls
-
-; RANDOM_RATE_1: @sanitized
-; RANDOM_RATE_1-NEXT: %x = alloca i8, i64 4
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-percentile-cutoff-hot=700000 | FileCheck %s --check-prefix=HOT70
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-percentile-cutoff-hot=990000 | FileCheck %s --check-prefix=HOT99
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-random-skip-rate=0.0 | FileCheck %s --check-prefix=RANDOM0
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-random-skip-rate=1.0 | FileCheck %s --check-prefix=RANDOM1
+
+; HOT70: @sanitized
+; HOT70-NEXT: @__hwasan_tls
+
+; HOT99: @sanitized
+; HOT99-NEXT: %x = alloca i8, i64 4
+
+; RANDOM0: @sanitized
+; RANDOM0-NEXT: @__hwasan_tls
+
+; RANDOM1: @sanitized
+; RANDOM1-NEXT: %x = alloca i8, i64 4
declare void @use(ptr)
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index 4ab5567f6287..493350d7bd63 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -984,10 +984,10 @@ static const X86FoldTableEntry Table1[] = {
{X86::RORX32ri_EVEX, X86::RORX32mi_EVEX, 0},
{X86::RORX64ri, X86::RORX64mi, 0},
{X86::RORX64ri_EVEX, X86::RORX64mi_EVEX, 0},
- {X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16},
- {X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16},
- {X86::ROUNDSDr, X86::ROUNDSDm, 0},
- {X86::ROUNDSSr, X86::ROUNDSSm, 0},
+ {X86::ROUNDPDri, X86::ROUNDPDmi, TB_ALIGN_16},
+ {X86::ROUNDPSri, X86::ROUNDPSmi, TB_ALIGN_16},
+ {X86::ROUNDSDri, X86::ROUNDSDmi, 0},
+ {X86::ROUNDSSri, X86::ROUNDSSmi, 0},
{X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16},
{X86::RSQRTSSr, X86::RSQRTSSm, 0},
{X86::SAR16r1_ND, X86::SAR16m1_ND, 0},
@@ -1791,10 +1791,10 @@ static const X86FoldTableEntry Table1[] = {
{X86::VRNDSCALEPSZ128rri, X86::VRNDSCALEPSZ128rmi, 0},
{X86::VRNDSCALEPSZ256rri, X86::VRNDSCALEPSZ256rmi, 0},
{X86::VRNDSCALEPSZrri, X86::VRNDSCALEPSZrmi, 0},
- {X86::VROUNDPDYr, X86::VROUNDPDYm, 0},
- {X86::VROUNDPDr, X86::VROUNDPDm, 0},
- {X86::VROUNDPSYr, X86::VROUNDPSYm, 0},
- {X86::VROUNDPSr, X86::VROUNDPSm, 0},
+ {X86::VROUNDPDYri, X86::VROUNDPDYmi, 0},
+ {X86::VROUNDPDri, X86::VROUNDPDmi, 0},
+ {X86::VROUNDPSYri, X86::VROUNDPSYmi, 0},
+ {X86::VROUNDPSri, X86::VROUNDPSmi, 0},
{X86::VRSQRT14PDZ128r, X86::VRSQRT14PDZ128m, 0},
{X86::VRSQRT14PDZ256r, X86::VRSQRT14PDZ256m, 0},
{X86::VRSQRT14PDZr, X86::VRSQRT14PDZm, 0},
@@ -2234,8 +2234,8 @@ static const X86FoldTableEntry Table2[] = {
{X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16},
{X86::PXORrr, X86::PXORrm, TB_ALIGN_16},
{X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE},
- {X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE},
- {X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE},
+ {X86::ROUNDSDri_Int, X86::ROUNDSDmi_Int, TB_NO_REVERSE},
+ {X86::ROUNDSSri_Int, X86::ROUNDSSmi_Int, TB_NO_REVERSE},
{X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE},
{X86::SBB16rr, X86::SBB16rm, 0},
{X86::SBB16rr_ND, X86::SBB16rm_ND, 0},
@@ -3778,10 +3778,10 @@ static const X86FoldTableEntry Table2[] = {
{X86::VRNDSCALESHZr_Int, X86::VRNDSCALESHZm_Int, TB_NO_REVERSE},
{X86::VRNDSCALESSZr, X86::VRNDSCALESSZm, 0},
{X86::VRNDSCALESSZr_Int, X86::VRNDSCALESSZm_Int, TB_NO_REVERSE},
- {X86::VROUNDSDr, X86::VROUNDSDm, 0},
- {X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE},
- {X86::VROUNDSSr, X86::VROUNDSSm, 0},
- {X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE},
+ {X86::VROUNDSDri, X86::VROUNDSDmi, 0},
+ {X86::VROUNDSDri_Int, X86::VROUNDSDmi_Int, TB_NO_REVERSE},
+ {X86::VROUNDSSri, X86::VROUNDSSmi, 0},
+ {X86::VROUNDSSri_Int, X86::VROUNDSSmi_Int, TB_NO_REVERSE},
{X86::VRSQRT14PDZ128rkz, X86::VRSQRT14PDZ128mkz, 0},
{X86::VRSQRT14PDZ256rkz, X86::VRSQRT14PDZ256mkz, 0},
{X86::VRSQRT14PDZrkz, X86::VRSQRT14PDZmkz, 0},
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/range.ll b/llvm/test/Transforms/CorrelatedValuePropagation/range.ll
index cc66cbe7fce6..ce1b591218d1 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/range.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/range.ll
@@ -102,9 +102,9 @@ if.end8:
define i32 @test4(i32 %c) nounwind {
; CHECK-LABEL: @test4(
; CHECK-NEXT: switch i32 [[C:%.*]], label [[SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 1, label [[SW_BB:%.*]]
-; CHECK-NEXT: i32 2, label [[SW_BB]]
-; CHECK-NEXT: i32 4, label [[SW_BB]]
+; CHECK-NEXT: i32 1, label [[SW_BB:%.*]]
+; CHECK-NEXT: i32 2, label [[SW_BB]]
+; CHECK-NEXT: i32 4, label [[SW_BB]]
; CHECK-NEXT: ]
; CHECK: sw.bb:
; CHECK-NEXT: br i1 true, label [[IF_THEN:%.*]], label [[IF_END:%.*]]
@@ -207,8 +207,8 @@ define i1 @test7(i32 %c) nounwind {
; CHECK-LABEL: @test7(
; CHECK-NEXT: entry:
; CHECK-NEXT: switch i32 [[C:%.*]], label [[SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 6, label [[SW_BB:%.*]]
-; CHECK-NEXT: i32 7, label [[SW_BB]]
+; CHECK-NEXT: i32 6, label [[SW_BB:%.*]]
+; CHECK-NEXT: i32 7, label [[SW_BB]]
; CHECK-NEXT: ]
; CHECK: sw.bb:
; CHECK-NEXT: ret i1 true
@@ -790,8 +790,8 @@ define i32 @test18(i8 %a) {
; CHECK-NEXT: br label [[DISPATCH:%.*]]
; CHECK: dispatch:
; CHECK-NEXT: switch i8 [[A]], label [[DISPATCH]] [
-; CHECK-NEXT: i8 93, label [[TARGET93:%.*]]
-; CHECK-NEXT: i8 -111, label [[DISPATCH]]
+; CHECK-NEXT: i8 93, label [[TARGET93:%.*]]
+; CHECK-NEXT: i8 -111, label [[DISPATCH]]
; CHECK-NEXT: ]
; CHECK: target93:
; CHECK-NEXT: ret i32 93
@@ -817,8 +817,8 @@ define i8 @test19(i8 %a) {
; CHECK-NEXT: br label [[DISPATCH:%.*]]
; CHECK: dispatch:
; CHECK-NEXT: switch i8 [[A]], label [[DISPATCH]] [
-; CHECK-NEXT: i8 93, label [[TARGET93:%.*]]
-; CHECK-NEXT: i8 -111, label [[DISPATCH]]
+; CHECK-NEXT: i8 93, label [[TARGET93:%.*]]
+; CHECK-NEXT: i8 -111, label [[DISPATCH]]
; CHECK-NEXT: ]
; CHECK: target93:
; CHECK-NEXT: ret i8 96
@@ -846,8 +846,8 @@ define i1 @test20(i64 %a) {
; CHECK-NEXT: br label [[DISPATCH:%.*]]
; CHECK: dispatch:
; CHECK-NEXT: switch i64 [[A]], label [[DEFAULT:%.*]] [
-; CHECK-NEXT: i64 0, label [[EXIT2:%.*]]
-; CHECK-NEXT: i64 -2147483647, label [[EXIT2]]
+; CHECK-NEXT: i64 0, label [[EXIT2:%.*]]
+; CHECK-NEXT: i64 -2147483647, label [[EXIT2]]
; CHECK-NEXT: ]
; CHECK: default:
; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[B]], 0
@@ -1123,6 +1123,70 @@ else:
ret i1 true
}
+define i1 @icmp_eq_range_attr(i8 range(i8 1, 0) %i) {
+; CHECK-LABEL: @icmp_eq_range_attr(
+; CHECK-NEXT: ret i1 false
+;
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
+define i1 @neg_icmp_eq_range_attr(i8 range(i8 -1, 1) %i) {
+; CHECK-LABEL: @neg_icmp_eq_range_attr(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[I:%.*]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
+declare range(i8 1, 0) i8 @returns_non_zero_range_helper()
+declare range(i8 -1, 1) i8 @returns_contain_zero_range_helper()
+
+define i1 @icmp_eq_range_return() {
+; CHECK-LABEL: @icmp_eq_range_return(
+; CHECK-NEXT: [[I:%.*]] = call i8 @returns_non_zero_range_helper()
+; CHECK-NEXT: ret i1 false
+;
+ %i = call i8 @returns_non_zero_range_helper()
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
+define i1 @neg_icmp_eq_range_return() {
+; CHECK-LABEL: @neg_icmp_eq_range_return(
+; CHECK-NEXT: [[I:%.*]] = call i8 @returns_contain_zero_range_helper()
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[I]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i = call i8 @returns_contain_zero_range_helper()
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
+declare i8 @returns_i8_helper()
+
+define i1 @icmp_eq_range_call() {
+; CHECK-LABEL: @icmp_eq_range_call(
+; CHECK-NEXT: [[I:%.*]] = call range(i8 1, 0) i8 @returns_i8_helper()
+; CHECK-NEXT: ret i1 false
+;
+ %i = call range(i8 1, 0) i8 @returns_i8_helper()
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
+define i1 @neg_icmp_eq_range_call() {
+; CHECK-LABEL: @neg_icmp_eq_range_call(
+; CHECK-NEXT: [[I:%.*]] = call range(i8 0, 11) i8 @returns_i8_helper()
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[I]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i = call range(i8 0, 11) i8 @returns_i8_helper()
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
declare i16 @llvm.ctlz.i16(i16, i1)
declare i16 @llvm.cttz.i16(i16, i1)
declare i16 @llvm.ctpop.i16(i16)
diff --git a/llvm/test/Transforms/InstCombine/implies.ll b/llvm/test/Transforms/InstCombine/implies.ll
new file mode 100644
index 000000000000..c02d84d3f837
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/implies.ll
@@ -0,0 +1,424 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i1 @or_implies_sle(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_implies_sle(
+; CHECK-NEXT: [[OR:%.*]] = or i8 [[X:%.*]], 23
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[OR]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %or = or i8 %x, 23
+ %cond = icmp sle i8 %or, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @or_implies_sle_fail(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_implies_sle_fail(
+; CHECK-NEXT: [[OR:%.*]] = or i8 [[X:%.*]], -34
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[OR]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X]], [[Y]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %or = or i8 %x, -34
+ %cond = icmp sle i8 %or, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @or_distjoint_implies_ule(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_distjoint_implies_ule(
+; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = or disjoint i8 %x, 23
+ %x2 = or disjoint i8 %x, 24
+
+ %cond = icmp ule i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp ule i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @or_distjoint_implies_ule_fail(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_distjoint_implies_ule_fail(
+; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[X1:%.*]] = or disjoint i8 [[X]], 28
+; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X1]], [[Y]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = or disjoint i8 %x, 28
+ %x2 = or disjoint i8 %x, 24
+
+ %cond = icmp ule i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp ule i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @or_prove_distjoin_implies_ule(i8 %xx, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_prove_distjoin_implies_ule(
+; CHECK-NEXT: [[X:%.*]] = and i8 [[XX:%.*]], -16
+; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X]], 10
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x = and i8 %xx, -16
+ %x1 = or i8 %x, 7
+ %x2 = or i8 %x, 10
+
+ %cond = icmp ule i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp ule i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_or_distjoint_implies_sle(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @src_or_distjoint_implies_sle(
+; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = or disjoint i8 %x, 23
+ %x2 = or disjoint i8 %x, 24
+
+ %cond = icmp sle i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_or_distjoint_implies_sle_fail(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @src_or_distjoint_implies_sle_fail(
+; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp slt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[X1:%.*]] = or disjoint i8 [[X]], 23
+; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X1]], [[Y]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = or disjoint i8 %x, 23
+ %x2 = or disjoint i8 %x, 24
+
+ %cond = icmp sle i8 %y, %x2
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_addnsw_implies_sle(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @src_addnsw_implies_sle(
+; CHECK-NEXT: [[X2:%.*]] = add nsw i8 [[X:%.*]], 24
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = add nsw i8 %x, 23
+ %x2 = add nsw i8 %x, 24
+
+ %cond = icmp sle i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_addnsw_implies_sle_fail(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @src_addnsw_implies_sle_fail(
+; CHECK-NEXT: [[X2:%.*]] = add nsw i8 [[X:%.*]], 23
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[X1:%.*]] = add nsw i8 [[X]], 24
+; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X1]], [[Y]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = add nsw i8 %x, 24
+ %x2 = add nsw i8 %x, 23
+
+ %cond = icmp sle i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_and_implies_ult(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_and_implies_ult(
+; CHECK-NEXT: [[COND:%.*]] = icmp ult i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp ult i8 %x, %z
+ br i1 %cond, label %T, label %F
+T:
+ %and = and i8 %z, %x
+ %r = icmp ult i8 %and, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_and_implies_ult_fail(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_and_implies_ult_fail(
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[Z]]
+; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[AND]], [[Z]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp ule i8 %x, %z
+ br i1 %cond, label %T, label %F
+T:
+ %and = and i8 %x, %z
+ %r = icmp ult i8 %and, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_and_implies_slt_fail(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_and_implies_slt_fail(
+; CHECK-NEXT: [[COND:%.*]] = icmp slt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[AND]], [[Z]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp slt i8 %x, %z
+ br i1 %cond, label %T, label %F
+T:
+ %and = and i8 %x, %y
+ %r = icmp slt i8 %and, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_or_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_or_implies_ule(
+; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[OR]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %or = or i8 %y, %x
+ %cond = icmp uge i8 %z, %or
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp ule i8 %x, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_or_implies_false_ugt_todo(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_or_implies_false_ugt_todo(
+; CHECK-NEXT: [[OR:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[COND:%.*]] = icmp ugt i8 [[OR]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+; CHECK: F:
+; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[Z]]
+; CHECK-NEXT: ret i1 [[R]]
+;
+ %or = or i8 %x, %y
+ %cond = icmp ugt i8 %or, %z
+ br i1 %cond, label %T, label %F
+T:
+ ret i1 %other
+F:
+ %r = icmp ugt i8 %x, %z
+ ret i1 %r
+
+}
+
+define i1 @src_udiv_implies_ult(i8 %x, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_udiv_implies_ult(
+; CHECK-NEXT: [[COND:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]]
+; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp ugt i8 %z, %x
+ br i1 %cond, label %T, label %F
+T:
+ %and = udiv i8 %x, 3
+ %r = icmp ult i8 %and, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_udiv_implies_ult2(i8 %x, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_udiv_implies_ult2(
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 true
+;
+ %cond = icmp ule i8 %z, %x
+ br i1 %cond, label %T, label %F
+T:
+ ret i1 %other
+F:
+ %and = udiv i8 %x, 3
+ %r = icmp ult i8 %and, %z
+ ret i1 %r
+}
+
+define i1 @src_smin_implies_sle(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_smin_implies_sle(
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp sle i8 %x, %z
+ br i1 %cond, label %T, label %F
+T:
+ %um = call i8 @llvm.smin.i8(i8 %x, i8 %y)
+ %r = icmp sle i8 %um, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_umin_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_umin_implies_ule(
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp ule i8 %x, %z
+ br i1 %cond, label %T, label %F
+T:
+ %um = call i8 @llvm.umin.i8(i8 %x, i8 %y)
+ %r = icmp ule i8 %um, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_umax_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_umax_implies_ule(
+; CHECK-NEXT: [[UM:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[UM]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %um = call i8 @llvm.umax.i8(i8 %x, i8 %y)
+ %cond = icmp ule i8 %um, %z
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp ule i8 %x, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_smax_implies_sle(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_smax_implies_sle(
+; CHECK-NEXT: [[UM:%.*]] = call i8 @llvm.smax.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[UM]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %um = call i8 @llvm.smax.i8(i8 %x, i8 %y)
+ %cond = icmp sle i8 %um, %z
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll
index 5305c78f6912..769f7661fc8d 100644
--- a/llvm/test/Transforms/InstCombine/known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/known-bits.ll
@@ -124,7 +124,6 @@ exit:
ret i8 %or2
}
-
define i8 @test_cond_and_bothways(i8 %x) {
; CHECK-LABEL: @test_cond_and_bothways(
; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], 91
@@ -181,8 +180,6 @@ exit:
ret i8 %or2
}
-
-
define i8 @test_cond_and_commuted(i8 %x, i1 %c1, i1 %c2) {
; CHECK-LABEL: @test_cond_and_commuted(
; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], 3
@@ -343,7 +340,7 @@ exit:
ret i8 %or2
}
-define i32 @test_icmp_trunc1(i32 %x){
+define i32 @test_icmp_trunc1(i32 %x) {
; CHECK-LABEL: @test_icmp_trunc1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[Y:%.*]] = trunc i32 [[X:%.*]] to i16
@@ -365,7 +362,7 @@ else:
ret i32 0
}
-define i32 @test_icmp_trunc_assume(i32 %x){
+define i32 @test_icmp_trunc_assume(i32 %x) {
; CHECK-LABEL: @test_icmp_trunc_assume(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[Y:%.*]] = trunc i32 [[X:%.*]] to i16
@@ -532,7 +529,106 @@ if.else:
ret i1 %other
}
+define i8 @and_eq_bits_must_be_set(i8 %x, i8 %y) {
+; CHECK-LABEL: @and_eq_bits_must_be_set(
+; CHECK-NEXT: [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 123
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: ret i8 1
+;
+ %xy = and i8 %x, %y
+ %cmp = icmp eq i8 %xy, 123
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %x, 1
+ ret i8 %r
+}
+
+define i8 @and_eq_bits_must_be_set2(i8 %x, i8 %y) {
+; CHECK-LABEL: @and_eq_bits_must_be_set2(
+; CHECK-NEXT: [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 123
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: ret i8 11
+;
+ %xy = and i8 %x, %y
+ %cmp = icmp eq i8 %xy, 123
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %y, 11
+ ret i8 %r
+}
+
+define i8 @and_eq_bits_must_be_set2_partial_fail(i8 %x, i8 %y) {
+; CHECK-LABEL: @and_eq_bits_must_be_set2_partial_fail(
+; CHECK-NEXT: [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 123
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: [[R:%.*]] = and i8 [[Y]], 111
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %xy = and i8 %x, %y
+ %cmp = icmp eq i8 %xy, 123
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %y, 111
+ ret i8 %r
+}
+
+define i8 @or_eq_bits_must_be_unset(i8 %x, i8 %y) {
+; CHECK-LABEL: @or_eq_bits_must_be_unset(
+; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 124
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: ret i8 0
+;
+ %xy = or i8 %x, %y
+ %cmp = icmp eq i8 %xy, 124
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %x, 3
+ ret i8 %r
+}
+
+define i8 @or_eq_bits_must_be_unset2(i8 %x, i8 %y) {
+; CHECK-LABEL: @or_eq_bits_must_be_unset2(
+; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 124
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: ret i8 0
+;
+ %xy = or i8 %x, %y
+ %cmp = icmp eq i8 %xy, 124
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %y, 1
+ ret i8 %r
+}
+define i8 @or_eq_bits_must_be_unset2_partial_fail(i8 %x, i8 %y) {
+; CHECK-LABEL: @or_eq_bits_must_be_unset2_partial_fail(
+; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 124
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: [[R:%.*]] = and i8 [[Y]], 4
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %xy = or i8 %x, %y
+ %cmp = icmp eq i8 %xy, 124
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %y, 7
+ ret i8 %r
+}
+
+define i8 @or_ne_bits_must_be_unset2_fail(i8 %x, i8 %y) {
+; CHECK-LABEL: @or_ne_bits_must_be_unset2_fail(
+; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 [[XY]], 124
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: [[R:%.*]] = and i8 [[X]], 3
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %xy = or i8 %x, %y
+ %cmp = icmp ne i8 %xy, 124
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %x, 3
+ ret i8 %r
+}
declare void @use(i1)
declare void @sink(i8)
diff --git a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
index 661c36038a67..a4b74aa8cc7d 100644
--- a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
+++ b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -passes='instcombine<no-verify-fixpoint>' -S | FileCheck %s
define i8 @zext_or_icmp_icmp(i8 %a, i8 %b) {
; CHECK-LABEL: @zext_or_icmp_icmp(
@@ -180,11 +180,11 @@ define i8 @PR49475_infloop(i32 %t0, i16 %insert, i64 %e, i8 %i162) {
; CHECK-NEXT: [[SEXT:%.*]] = shl i64 [[SUB17]], 32
; CHECK-NEXT: [[CONV18:%.*]] = ashr exact i64 [[SEXT]], 32
; CHECK-NEXT: [[CMP:%.*]] = icmp sge i64 [[XOR]], [[CONV18]]
-; CHECK-NEXT: [[CONV19:%.*]] = zext i1 [[CMP]] to i16
-; CHECK-NEXT: [[OR21:%.*]] = or i16 [[CONV19]], [[INSERT]]
-; CHECK-NEXT: [[TOBOOL23_NOT:%.*]] = icmp eq i16 [[OR21]], 0
+; CHECK-NEXT: [[TRUNC44:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT: [[INC:%.*]] = add i8 [[TRUNC44]], [[I162]]
+; CHECK-NEXT: [[TOBOOL23_NOT:%.*]] = xor i1 [[CMP]], true
; CHECK-NEXT: call void @llvm.assume(i1 [[TOBOOL23_NOT]])
-; CHECK-NEXT: ret i8 [[I162]]
+; CHECK-NEXT: ret i8 [[INC]]
;
%b = icmp eq i32 %t0, 0
%b2 = icmp eq i16 %insert, 0
diff --git a/llvm/test/Transforms/InstSimplify/implies.ll b/llvm/test/Transforms/InstSimplify/implies.ll
index b70dc90da655..7e3cb656bce1 100644
--- a/llvm/test/Transforms/InstSimplify/implies.ll
+++ b/llvm/test/Transforms/InstSimplify/implies.ll
@@ -155,7 +155,13 @@ define i1 @test9(i32 %length.i, i32 %i) {
define i1 @test10(i32 %length.i, i32 %x.full) {
; CHECK-LABEL: @test10(
-; CHECK-NEXT: ret i1 true
+; CHECK-NEXT: [[X:%.*]] = and i32 [[X_FULL:%.*]], -65536
+; CHECK-NEXT: [[LARGE:%.*]] = or i32 [[X]], 100
+; CHECK-NEXT: [[SMALL:%.*]] = or i32 [[X]], 90
+; CHECK-NEXT: [[KNOWN:%.*]] = icmp ult i32 [[LARGE]], [[LENGTH_I:%.*]]
+; CHECK-NEXT: [[TO_PROVE:%.*]] = icmp ult i32 [[SMALL]], [[LENGTH_I]]
+; CHECK-NEXT: [[RES:%.*]] = icmp ule i1 [[KNOWN]], [[TO_PROVE]]
+; CHECK-NEXT: ret i1 [[RES]]
;
%x = and i32 %x.full, 4294901760 ;; 4294901760 == 0xffff0000
%large = or i32 %x, 100
@@ -166,6 +172,19 @@ define i1 @test10(i32 %length.i, i32 %x.full) {
ret i1 %res
}
+define i1 @test10_with_disjoint(i32 %length.i, i32 %x.full) {
+; CHECK-LABEL: @test10_with_disjoint(
+; CHECK-NEXT: ret i1 true
+;
+ %x = and i32 %x.full, 4294901760 ;; 4294901760 == 0xffff0000
+ %large = or disjoint i32 %x, 100
+ %small = or disjoint i32 %x, 90
+ %known = icmp ult i32 %large, %length.i
+ %to.prove = icmp ult i32 %small, %length.i
+ %res = icmp ule i1 %known, %to.prove
+ ret i1 %res
+}
+
define i1 @test11(i32 %length.i, i32 %x) {
; CHECK-LABEL: @test11(
; CHECK-NEXT: [[LARGE:%.*]] = or i32 [[X:%.*]], 100
@@ -216,7 +235,13 @@ define i1 @test13(i32 %length.i, i32 %x) {
define i1 @test14(i32 %length.i, i32 %x.full) {
; CHECK-LABEL: @test14(
-; CHECK-NEXT: ret i1 true
+; CHECK-NEXT: [[X:%.*]] = and i32 [[X_FULL:%.*]], -61681
+; CHECK-NEXT: [[LARGE:%.*]] = or i32 [[X]], 8224
+; CHECK-NEXT: [[SMALL:%.*]] = or i32 [[X]], 4112
+; CHECK-NEXT: [[KNOWN:%.*]] = icmp ult i32 [[LARGE]], [[LENGTH_I:%.*]]
+; CHECK-NEXT: [[TO_PROVE:%.*]] = icmp ult i32 [[SMALL]], [[LENGTH_I]]
+; CHECK-NEXT: [[RES:%.*]] = icmp ule i1 [[KNOWN]], [[TO_PROVE]]
+; CHECK-NEXT: ret i1 [[RES]]
;
%x = and i32 %x.full, 4294905615 ;; 4294905615 == 0xffff0f0f
%large = or i32 %x, 8224 ;; == 0x2020
@@ -227,6 +252,19 @@ define i1 @test14(i32 %length.i, i32 %x.full) {
ret i1 %res
}
+define i1 @test14_with_disjoint(i32 %length.i, i32 %x.full) {
+; CHECK-LABEL: @test14_with_disjoint(
+; CHECK-NEXT: ret i1 true
+;
+ %x = and i32 %x.full, 4294905615 ;; 4294905615 == 0xffff0f0f
+ %large = or disjoint i32 %x, 8224 ;; == 0x2020
+ %small = or disjoint i32 %x, 4112 ;; == 0x1010
+ %known = icmp ult i32 %large, %length.i
+ %to.prove = icmp ult i32 %small, %length.i
+ %res = icmp ule i1 %known, %to.prove
+ ret i1 %res
+}
+
define i1 @test15(i32 %length.i, i32 %x) {
; CHECK-LABEL: @test15(
; CHECK-NEXT: [[LARGE:%.*]] = add nuw i32 [[X:%.*]], 100
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index 3e895edcd4f4..afd49aa9f0b8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -43,9 +43,8 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP16]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -135,9 +134,8 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP16]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-force-tail-with-evl.ll
new file mode 100644
index 000000000000..2ce2a45a811a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-force-tail-with-evl.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -S < %s | FileCheck %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -S < %s | FileCheck %s
+
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
new file mode 100644
index 000000000000..5d1a471c5d16
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
@@ -0,0 +1,117 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -disable-output < %s 2>&1 | FileCheck %s
+
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; CHECK-LABEL: VPlan 'Initial VPlan for VF={2,4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in vp<%2> = backedge-taken count
+; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%16>
+; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1>
+; CHECK-NEXT: EMIT vp<%5> = icmp ule ir<%iv>, vp<%2>
+; CHECK-NEXT: Successor(s): pred.store
+; CHECK-EMPTY:
+; CHECK-NEXT: <xVFxUF> pred.store: {
+; CHECK-NEXT: pred.store.entry:
+; CHECK-NEXT: BRANCH-ON-MASK vp<%5>
+; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.if:
+; CHECK-NEXT: vp<%6> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%6>
+; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx>
+; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%6>
+; CHECK-NEXT: REPLICATE ir<%1> = load ir<%arrayidx2>
+; CHECK-NEXT: REPLICATE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%6>
+; CHECK-NEXT: REPLICATE ir<%add> = add nsw ir<%1>, ir<%0>
+; CHECK-NEXT: REPLICATE store ir<%add>, ir<%arrayidx4>
+; CHECK-NEXT: Successor(s): pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.continue:
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%14> = ir<%0>
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%15> = ir<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): for.body.2
+; CHECK-EMPTY:
+; CHECK-NEXT: for.body.2:
+; CHECK-NEXT: EMIT vp<%16> = add vp<%3>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%16>, vp<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+define void @safe_dep(ptr %p) {
+; CHECK-LABEL: VPlan 'Initial VPlan for VF={2},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<512> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%10>
+; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK-NEXT: CLONE ir<%a1> = getelementptr ir<%p>, vp<%3>
+; CHECK-NEXT: vp<%5> = vector-pointer ir<%a1>
+; CHECK-NEXT: WIDEN ir<%v> = load vp<%5>
+; CHECK-NEXT: CLONE ir<%offset> = add vp<%3>, ir<100>
+; CHECK-NEXT: CLONE ir<%a2> = getelementptr ir<%p>, ir<%offset>
+; CHECK-NEXT: vp<%9> = vector-pointer ir<%a2>
+; CHECK-NEXT: WIDEN store vp<%9>, ir<%v>
+; CHECK-NEXT: EMIT vp<%10> = add nuw vp<%2>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%10>, vp<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [0, %entry], [%iv.next, %loop]
+ %a1 = getelementptr i64, ptr %p, i64 %iv
+ %v = load i64, ptr %a1, align 32
+ %offset = add i64 %iv, 100
+ %a2 = getelementptr i64, ptr %p, i64 %offset
+ store i64 %v, ptr %a2, align 32
+ %iv.next = add i64 %iv, 1
+ %cmp = icmp ne i64 %iv, 511
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index 57e1dc9051f4..b876e9d2c1a5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -1,11 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize < %s -S -o - | FileCheck %s -check-prefix=OUTLOOP
; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize -prefer-inloop-reductions < %s -S -o - | FileCheck %s -check-prefix=INLOOP
-
+; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "riscv64"
+; FIXME: inloop reductions are not supported yet with predicated vectorization.
+
define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; OUTLOOP-LABEL: @add_i16_i32(
; OUTLOOP-NEXT: entry:
@@ -115,6 +117,70 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; INLOOP-NEXT: ret i32 [[R_0_LCSSA]]
;
+; IF-EVL-LABEL: @add_i16_i32(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; IF-EVL-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; IF-EVL: for.body.preheader:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; IF-EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; IF-EVL-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP4]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[INDEX]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; IF-EVL-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = icmp ule <vscale x 4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP12]], i32 2, <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i16> poison)
+; IF-EVL-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
+; IF-EVL-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP15]])
+; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; IF-EVL-NEXT: [[CONV:%.*]] = sext i16 [[TMP18]] to i32
+; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
+; IF-EVL-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
+; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.cond.cleanup.loopexit:
+; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; IF-EVL-NEXT: ret i32 [[R_0_LCSSA]]
+;
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body, label %for.cond.cleanup
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
new file mode 100644
index 000000000000..835ff3756881
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
+define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %index, i64 %n) {
+; IF-EVL-LABEL: @gather_scatter(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; IF-EVL-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP11]], zeroinitializer
+; IF-EVL-NEXT: [[TMP13:%.*]] = mul <vscale x 2 x i64> [[TMP12]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP13]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; IF-EVL-NEXT: [[TMP16:%.*]] = mul i64 1, [[TMP15]]
+; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP16]], i64 0
+; IF-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP17]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp ule <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], <vscale x 2 x i64> [[VEC_IND]]
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP20]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x float> @llvm.vp.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP21]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT: call void @llvm.vp.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> [[WIDE_MASKED_GATHER2]], <vscale x 2 x ptr> align 4 [[TMP22]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP18]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX1]], [[TMP10]]
+; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX]], i64 [[INDVARS_IV]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; IF-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[TMP25]]
+; IF-EVL-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP25]]
+; IF-EVL-NEXT: store float [[TMP26]], ptr [[ARRAYIDX7]], align 4
+; IF-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @gather_scatter(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[INDVARS_IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; NO-VP-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; NO-VP-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: store float [[TMP1]], ptr [[ARRAYIDX7]], align 4
+; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; NO-VP: for.end:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx3 = getelementptr inbounds i32, ptr %index, i64 %indvars.iv
+ %0 = load i64, ptr %arrayidx3, align 8
+ %arrayidx5 = getelementptr inbounds float, ptr %in, i64 %0
+ %1 = load float, ptr %arrayidx5, align 4
+ %arrayidx7 = getelementptr inbounds float, ptr %out, i64 %0
+ store float %1, ptr %arrayidx7, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
new file mode 100644
index 000000000000..0b495bc680f0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
+
+; FIXME: interleaved accesses are not supported yet with predicated vectorization.
+define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
+; IF-EVL-LABEL: @interleave(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP31]], 8
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP17]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; IF-EVL-NEXT: [[TMP32:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP32]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i64> [[TMP11]], zeroinitializer
+; IF-EVL-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i64> [[TMP12]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP13]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; IF-EVL-NEXT: [[TMP37:%.*]] = mul i64 1, [[TMP15]]
+; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP37]], i64 0
+; IF-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
+; IF-EVL-NEXT: [[TMP38:%.*]] = add i64 [[TMP19]], 0
+; IF-EVL-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 1
+; IF-EVL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], [[TMP39]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = icmp ule <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
+; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP25]], i32 4, <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP26]], i32 4, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
+; IF-EVL-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 1
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP27]], i32 4, <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP28]], i32 4, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[TMP29:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER3]], [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT: [[TMP30:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER2]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; IF-EVL-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4
+; IF-EVL-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP35]]
+; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr [[TMP33]], i32 4, <vscale x 4 x i1> [[TMP23]])
+; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP30]], ptr [[TMP36]], i32 4, <vscale x 4 x i1> [[TMP24]])
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
+; IF-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
+; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP21]]
+; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @interleave(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
+; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP10]], i32 0
+; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP1]], i32 0
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; NO-VP-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
+; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; NO-VP-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; NO-VP-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; NO-VP-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; NO-VP-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
+; NO-VP-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]]
+; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 8
+; NO-VP-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP12]], align 4
+; NO-VP-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP11]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; NO-VP-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
+; NO-VP-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
+; NO-VP-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP30]], [[TMP29]]
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds [2 x i32], ptr %b, i64 %iv, i32 0
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [2 x i32], ptr %b, i64 %iv, i32 1
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+
+for.cond.cleanup:
+ ret void
+}
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.interleave.count", i32 2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll
new file mode 100644
index 000000000000..d5ad99f5cff8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
+
+define void @iv32(ptr noalias %a, ptr noalias %b, i32 %N) {
+; IF-EVL-LABEL: @iv32(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP19:%.*]] = sub i32 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP19]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i32 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP13]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP_LOAD]], ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP12]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add i32 [[IV]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i32 [[IV_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV1]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV1]]
+; IF-EVL-NEXT: store i32 [[TMP0]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i32 [[IV1]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @iv32(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], [[TMP10]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT: [[TMP11:%.*]] = mul i32 [[TMP1]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP11]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT: [[TMP12:%.*]] = mul i32 [[TMP2]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP3]]
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]]
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; NO-VP-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP7]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP12]]
+; NO-VP-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
+; NO-VP-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; NO-VP-NEXT: store i32 [[TMP9]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i32 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i32 %iv
+ store i32 %0, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i32 %iv, 1
+ %exitcond.not = icmp eq i32 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
new file mode 100644
index 000000000000..203d0c977074
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
+define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) {
+; IF-EVL-LABEL: @masked_loadstore(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp ne <vscale x 4 x i32> [[VP_OP_LOAD]], zeroinitializer
+; IF-EVL-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP22]], <vscale x 4 x i1> [[TMP20]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP23:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD]], [[VP_OP_LOAD3]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP23]], ptr align 4 [[TMP22]], <vscale x 4 x i1> [[TMP20]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP24:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP24]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_011]]
+; IF-EVL-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP26]], 0
+; IF-EVL-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; IF-EVL: if.then:
+; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_011]]
+; IF-EVL-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add i32 [[TMP26]], [[TMP27]]
+; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
+; IF-EVL-NEXT: br label [[FOR_INC]]
+; IF-EVL: for.inc:
+; IF-EVL-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: exit:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @masked_loadstore(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[I_011]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP0]], 0
+; NO-VP-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; NO-VP: if.then:
+; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[I_011]]
+; NO-VP-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
+; NO-VP-NEXT: br label [[FOR_INC]]
+; NO-VP: for.inc:
+; NO-VP-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; NO-VP: exit:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %i.011 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %i.011
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp1 = icmp ne i32 %0, 0
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %arrayidx3 = getelementptr inbounds i32, ptr %a, i64 %i.011
+ %1 = load i32, ptr %arrayidx3, align 4
+ %add = add i32 %0, %1
+ store i32 %add, ptr %arrayidx3, align 4
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i64 %i.011, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll
new file mode 100644
index 000000000000..1c49fba1370e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s
+
+; No need to emit predicated vector code if the vector instructions with masking are not required.
+define i32 @no_masking() {
+; CHECK-LABEL: @no_masking(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[BODY:%.*]]
+; CHECK: body:
+; CHECK-NEXT: [[P:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[BODY]] ]
+; CHECK-NEXT: [[INC]] = add i32 [[P]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[BODY]]
+; CHECK: end:
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %body
+
+body:
+ %p = phi i32 [ 1, %entry ], [ %inc, %body ]
+ %inc = add i32 %p, 1
+ %cmp = icmp eq i32 %inc, 0
+ br i1 %cmp, label %end, label %body
+
+end:
+ ret i32 0
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
new file mode 100644
index 000000000000..f2222e0a1f93
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
+; FIXME: reversed loads/stores are not supported yet with predicated vectorization.
+define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %ptr2) {
+; IF-EVL-LABEL: @reverse_load_store(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; IF-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[INDEX]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1023, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP7]], -1
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 0, [[TMP14]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP14]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP15]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
+; IF-EVL-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[REVERSE]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
+; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP21]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP21]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP22]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
+; IF-EVL-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; IF-EVL-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE3]])
+; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE5]], ptr [[TMP25]], i32 4, <vscale x 4 x i1> [[REVERSE4]])
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1
+; IF-EVL-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]]
+; IF-EVL-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
+; IF-EVL-NEXT: [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2]], i64 [[ADD]]
+; IF-EVL-NEXT: store i32 [[TMP]], ptr [[GEPS]], align 4
+; IF-EVL-NEXT: [[INC]] = add i32 [[I]], 1
+; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
+; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: loopend:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @reverse_load_store(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[STARTVAL:%.*]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1
+; NO-VP-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[ADD]]
+; NO-VP-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
+; NO-VP-NEXT: [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[ADD]]
+; NO-VP-NEXT: store i32 [[TMP]], ptr [[GEPS]], align 4
+; NO-VP-NEXT: [[INC]] = add i32 [[I]], 1
+; NO-VP-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
+; NO-VP-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND:%.*]]
+; NO-VP: loopend:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %add.phi = phi i64 [ %startval, %entry ], [ %add, %for.body ]
+ %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %add = add i64 %add.phi, -1
+ %gepl = getelementptr inbounds i32, ptr %ptr, i64 %add
+ %tmp = load i32, ptr %gepl, align 4
+ %geps = getelementptr inbounds i32, ptr %ptr2, i64 %add
+ store i32 %tmp, ptr %geps, align 4
+ %inc = add i32 %i, 1
+ %exitcond = icmp ne i32 %inc, 1024
+ br i1 %exitcond, label %for.body, label %loopend
+
+loopend:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
new file mode 100644
index 000000000000..c69bb17f698a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
+
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL-LABEL: @foo(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP24]], [[TMP23]]
+; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @foo(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4
+; NO-VP-NEXT: [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP11]], ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
new file mode 100644
index 000000000000..72b881bd44c7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
@@ -0,0 +1,134 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL,CHECK %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP,CHECK %s
+
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+; IF-EVL-EMPTY:
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+; IF-EVL-EMPTY:
+; IF-EVL-NEXT: <x1> vector loop: {
+; IF-EVL-NEXT: vector.body:
+; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]>
+; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[EVL_PHI]]>, ir<%N>
+; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
+; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>, ir<true>
+; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>, ir<true>
+; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>, ir<true>
+; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
+; IF-EVL-NEXT: No successors
+; IF-EVL-NEXT: }
+
+; NO-VP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
+; NO-VP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; NO-VP-NEXT: Live-in ir<%N> = original trip-count
+; NO-VP-EMPTY:
+; NO-VP: vector.ph:
+; NO-VP-NEXT: Successor(s): vector loop
+; NO-VP-EMPTY:
+; NO-VP-NEXT: <x1> vector loop: {
+; NO-VP-NEXT: vector.body:
+; NO-VP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; NO-VP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; NO-VP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; NO-VP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>
+; NO-VP-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; NO-VP-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>
+; NO-VP-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; NO-VP-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>
+; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
+; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; NO-VP-NEXT: No successors
+; NO-VP-NEXT: }
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+define void @safe_dep(ptr %p) {
+; CHECK: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<512> = original trip-count
+; CHECK-EMPTY:
+; CHECK: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; CHECK-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr ir<%p>, vp<[[ST]]>
+; CHECK-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; CHECK-NEXT: WIDEN ir<[[V:%.+]]> = load vp<[[PTR1]]>
+; CHECK-NEXT: CLONE ir<[[OFFSET:.+]]> = add vp<[[ST]]>, ir<100>
+; CHECK-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr ir<%p>, ir<[[OFFSET]]>
+; CHECK-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; CHECK-NEXT: WIDEN store vp<[[PTR2]]>, ir<[[V]]>
+; CHECK-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [0, %entry], [%iv.next, %loop]
+ %a1 = getelementptr i64, ptr %p, i64 %iv
+ %v = load i64, ptr %a1, align 32
+ %offset = add i64 %iv, 100
+ %a2 = getelementptr i64, ptr %p, i64 %offset
+ store i64 %v, ptr %a2, align 32
+ %iv.next = add i64 %iv, 1
+ %cmp = icmp ne i64 %iv, 511
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
new file mode 100644
index 000000000000..1cf71360adf7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=x86_64 -mattr=+avx512f -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=x86_64 -mattr=+avx512f -S < %s 2>&1 | FileCheck --check-prefix=NO-VP %s
+
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL-LABEL: @foo(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 15
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison)
+; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison)
+; IF-EVL-NEXT: [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; IF-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <16 x i1> [[TMP1]])
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]]
+; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @foo(
+; NO-VP-NEXT: iter.check:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; NO-VP: vector.main.loop.iter.check:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 64
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 64
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16
+; NO-VP-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 32
+; NO-VP-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 48
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP2]]
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 16
+; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 32
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 48
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i32>, ptr [[TMP9]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i32>, ptr [[TMP11]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP2]]
+; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 16
+; NO-VP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 32
+; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 48
+; NO-VP-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP16]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP17]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP18]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i32>, ptr [[TMP19]], align 4
+; NO-VP-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD]]
+; NO-VP-NEXT: [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD2]]
+; NO-VP-NEXT: [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD3]]
+; NO-VP-NEXT: [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_LOAD8]], [[WIDE_LOAD4]]
+; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
+; NO-VP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
+; NO-VP-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 16
+; NO-VP-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 32
+; NO-VP-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 48
+; NO-VP-NEXT: store <16 x i32> [[TMP20]], ptr [[TMP28]], align 4
+; NO-VP-NEXT: store <16 x i32> [[TMP21]], ptr [[TMP29]], align 4
+; NO-VP-NEXT: store <16 x i32> [[TMP22]], ptr [[TMP30]], align 4
+; NO-VP-NEXT: store <16 x i32> [[TMP23]], ptr [[TMP31]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
+; NO-VP-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; NO-VP: vec.epilog.iter.check:
+; NO-VP-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; NO-VP-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; NO-VP: vec.epilog.ph:
+; NO-VP-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; NO-VP-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[N]], 8
+; NO-VP-NEXT: [[N_VEC10:%.*]] = sub i64 [[N]], [[N_MOD_VF9]]
+; NO-VP-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; NO-VP: vec.epilog.vector.body:
+; NO-VP-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP33:%.*]] = add i64 [[INDEX12]], 0
+; NO-VP-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP33]]
+; NO-VP-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP35]], align 4
+; NO-VP-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP33]]
+; NO-VP-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, ptr [[TMP37]], align 4
+; NO-VP-NEXT: [[TMP38:%.*]] = add nsw <8 x i32> [[WIDE_LOAD14]], [[WIDE_LOAD13]]
+; NO-VP-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP33]]
+; NO-VP-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 0
+; NO-VP-NEXT: store <8 x i32> [[TMP38]], ptr [[TMP40]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8
+; NO-VP-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC10]]
+; NO-VP-NEXT: br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: vec.epilog.middle.block:
+; NO-VP-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[N_VEC10]]
+; NO-VP-NEXT: br i1 [[CMP_N11]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_SCALAR_PH]]
+; NO-VP: vec.epilog.scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP42:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP43]], [[TMP42]]
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll
new file mode 100644
index 000000000000..9b49d44141db
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll
@@ -0,0 +1,89 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=x86_64 -mattr=+avx512f -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=x86_64 -mattr=+avx512f -disable-output < %s 2>&1 | FileCheck --check-prefix=NO-VP %s
+
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in vp<[[BETC:%[0-9]+]]> = backedge-taken count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+; IF-EVL-EMPTY:
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+; IF-EVL-EMPTY:
+; IF-EVL-NEXT: <x1> vector loop: {
+; IF-EVL-NEXT: vector.body:
+; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; IF-EVL-NEXT: EMIT vp<[[VIV:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[IV]]>
+; IF-EVL-NEXT: EMIT vp<[[MASK:%[0-9]+]]> = icmp ule vp<[[VIV]]>, vp<[[BETC]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>, vp<[[MASK]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>, vp<[[MASK]]>
+; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[MASK]]>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; IF-EVL-NEXT: No successors
+; IF-EVL-NEXT: }
+
+; NO-VP: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; NO-VP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; NO-VP-NEXT: Live-in ir<%N> = original trip-count
+; NO-VP-EMPTY:
+; NO-VP: vector.ph:
+; NO-VP-NEXT: Successor(s): vector loop
+; NO-VP-EMPTY:
+; NO-VP-NEXT: <x1> vector loop: {
+; NO-VP-NEXT: vector.body:
+; NO-VP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; NO-VP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; NO-VP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; NO-VP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>
+; NO-VP-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; NO-VP-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>
+; NO-VP-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; NO-VP-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>
+; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
+; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; NO-VP-NEXT: No successors
+; NO-VP-NEXT: }
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
new file mode 100644
index 000000000000..a90b38c6a960
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on -S < %s | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-vector-width=4 \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on -S < %s | FileCheck --check-prefix=NO-VP %s
+
+; The target does not support predicated vectorization.
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL-LABEL: @foo(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @foo(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP8]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP14:%.*]] = mul i64 [[TMP1]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP14]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP15:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
+; NO-VP-NEXT: [[TMP16:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP16]], ptr [[TMP10]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/vplan-force-tail-with-evl.ll
new file mode 100644
index 000000000000..f510d47d06e3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-force-tail-with-evl.ll
@@ -0,0 +1,37 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl -force-vector-width=4 \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on \
+; RUN: -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on \
+; RUN: -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP %s
+
+; The target does not support predicated vectorization.
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; NO-VP-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
index 495ec0a63399..45e411d73316 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
@@ -9,11 +9,7 @@ define <4 x i64> @PR67803(<4 x i64> %x, <4 x i64> %y, <4 x i64> %a, <4 x i64> %b
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i64> [[X:%.*]] to <8 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[Y:%.*]] to <8 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP0]], [[TMP1]]
-; CHECK-NEXT: [[CMP_I21:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[SEXT_I22:%.*]] = sext <4 x i1> [[CMP_I21]] to <4 x i32>
-; CHECK-NEXT: [[CMP_I:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[SEXT_I22]], <4 x i32> [[SEXT_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <32 x i8> [[TMP5]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
diff --git a/llvm/test/Transforms/RemoveTraps/remove-traps.ll b/llvm/test/Transforms/RemoveTraps/remove-traps.ll
index 71549e7d9b41..c8d5fecbf55a 100644
--- a/llvm/test/Transforms/RemoveTraps/remove-traps.ll
+++ b/llvm/test/Transforms/RemoveTraps/remove-traps.ll
@@ -1,18 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt < %s -passes='function(remove-traps)' -S | FileCheck %s --check-prefixes=NOPROFILE
; RUN: opt < %s -passes='function(remove-traps)' -remove-traps-random-rate=1 -S | FileCheck %s --check-prefixes=ALL
-; RUN: opt < %s -passes='require<profile-summary>,function(remove-traps)' -S | FileCheck %s --check-prefixes=HOT
+; RUN: opt < %s -passes='require<profile-summary>,function(remove-traps)' -remove-traps-percentile-cutoff-hot=990000 -S | FileCheck %s --check-prefixes=HOT99
; RUN: opt < %s -passes='require<profile-summary>,function(remove-traps)' -remove-traps-percentile-cutoff-hot=700000 -S | FileCheck %s --check-prefixes=HOT70
target triple = "x86_64-pc-linux-gnu"
declare void @llvm.ubsantrap(i8 immarg)
+declare i1 @llvm.allow.ubsan.check(i8 immarg)
define dso_local noundef i32 @simple(ptr noundef readonly %0) {
; NOPROFILE-LABEL: define dso_local noundef i32 @simple(
; NOPROFILE-SAME: ptr noundef readonly [[TMP0:%.*]]) {
; NOPROFILE-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; NOPROFILE-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; NOPROFILE-NEXT: [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; NOPROFILE-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; NOPROFILE: 3:
; NOPROFILE-NEXT: tail call void @llvm.ubsantrap(i8 22)
; NOPROFILE-NEXT: unreachable
@@ -23,28 +26,35 @@ define dso_local noundef i32 @simple(ptr noundef readonly %0) {
; ALL-LABEL: define dso_local noundef i32 @simple(
; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) {
; ALL-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; ALL-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; ALL-NEXT: [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; ALL-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; ALL: 3:
+; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
; ALL-NEXT: unreachable
; ALL: 4:
; ALL-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; ALL-NEXT: ret i32 [[TMP5]]
;
-; HOT-LABEL: define dso_local noundef i32 @simple(
-; HOT-SAME: ptr noundef readonly [[TMP0:%.*]]) {
-; HOT-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; HOT: 3:
-; HOT-NEXT: tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT: unreachable
-; HOT: 4:
-; HOT-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
-; HOT-NEXT: ret i32 [[TMP5]]
+; HOT99-LABEL: define dso_local noundef i32 @simple(
+; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) {
+; HOT99-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+; HOT99-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT99-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; HOT99-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; HOT99: 3:
+; HOT99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT: unreachable
+; HOT99: 4:
+; HOT99-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; HOT99-NEXT: ret i32 [[TMP5]]
;
; HOT70-LABEL: define dso_local noundef i32 @simple(
; HOT70-SAME: ptr noundef readonly [[TMP0:%.*]]) {
; HOT70-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT70-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; HOT70-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT70-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; HOT70-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; HOT70: 3:
; HOT70-NEXT: tail call void @llvm.ubsantrap(i8 22)
; HOT70-NEXT: unreachable
@@ -52,7 +62,10 @@ define dso_local noundef i32 @simple(ptr noundef readonly %0) {
; HOT70-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; HOT70-NEXT: ret i32 [[TMP5]]
;
- %2 = icmp eq ptr %0, null
+ %chk = icmp eq ptr %0, null
+ %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+ %hot = xor i1 %allow, true
+ %2 = or i1 %chk, %hot
br i1 %2, label %3, label %4
3:
@@ -69,7 +82,9 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
; NOPROFILE-LABEL: define dso_local noundef i32 @hot(
; NOPROFILE-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
; NOPROFILE-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; NOPROFILE-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; NOPROFILE-NEXT: [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; NOPROFILE-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; NOPROFILE: 3:
; NOPROFILE-NEXT: tail call void @llvm.ubsantrap(i8 22)
; NOPROFILE-NEXT: unreachable
@@ -80,27 +95,35 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
; ALL-LABEL: define dso_local noundef i32 @hot(
; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
; ALL-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; ALL-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; ALL-NEXT: [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; ALL-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; ALL: 3:
+; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
; ALL-NEXT: unreachable
; ALL: 4:
; ALL-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; ALL-NEXT: ret i32 [[TMP5]]
;
-; HOT-LABEL: define dso_local noundef i32 @hot(
-; HOT-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
-; HOT-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; HOT: 3:
-; HOT-NEXT: unreachable
-; HOT: 4:
-; HOT-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
-; HOT-NEXT: ret i32 [[TMP5]]
+; HOT99-LABEL: define dso_local noundef i32 @hot(
+; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
+; HOT99-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+; HOT99-NEXT: [[HOT:%.*]] = xor i1 false, true
+; HOT99-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; HOT99-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; HOT99: 3:
+; HOT99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT: unreachable
+; HOT99: 4:
+; HOT99-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; HOT99-NEXT: ret i32 [[TMP5]]
;
; HOT70-LABEL: define dso_local noundef i32 @hot(
; HOT70-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
; HOT70-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT70-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; HOT70-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT70-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; HOT70-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; HOT70: 3:
; HOT70-NEXT: tail call void @llvm.ubsantrap(i8 22)
; HOT70-NEXT: unreachable
@@ -108,7 +131,10 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
; HOT70-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; HOT70-NEXT: ret i32 [[TMP5]]
;
- %2 = icmp eq ptr %0, null
+ %chk = icmp eq ptr %0, null
+ %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+ %hot = xor i1 %allow, true
+ %2 = or i1 %chk, %hot
br i1 %2, label %3, label %4
3:
@@ -124,7 +150,9 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
; NOPROFILE-LABEL: define dso_local noundef i32 @veryHot(
; NOPROFILE-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
; NOPROFILE-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; NOPROFILE-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; NOPROFILE-NEXT: [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; NOPROFILE-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; NOPROFILE: 3:
; NOPROFILE-NEXT: tail call void @llvm.ubsantrap(i8 22)
; NOPROFILE-NEXT: unreachable
@@ -135,34 +163,46 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
; ALL-LABEL: define dso_local noundef i32 @veryHot(
; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
; ALL-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; ALL-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; ALL-NEXT: [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; ALL-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; ALL: 3:
+; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
; ALL-NEXT: unreachable
; ALL: 4:
; ALL-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; ALL-NEXT: ret i32 [[TMP5]]
;
-; HOT-LABEL: define dso_local noundef i32 @veryHot(
-; HOT-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
-; HOT-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; HOT: 3:
-; HOT-NEXT: unreachable
-; HOT: 4:
-; HOT-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
-; HOT-NEXT: ret i32 [[TMP5]]
+; HOT99-LABEL: define dso_local noundef i32 @veryHot(
+; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
+; HOT99-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+; HOT99-NEXT: [[HOT:%.*]] = xor i1 false, true
+; HOT99-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; HOT99-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; HOT99: 3:
+; HOT99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT: unreachable
+; HOT99: 4:
+; HOT99-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; HOT99-NEXT: ret i32 [[TMP5]]
;
; HOT70-LABEL: define dso_local noundef i32 @veryHot(
; HOT70-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
; HOT70-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT70-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; HOT70-NEXT: [[HOT:%.*]] = xor i1 false, true
+; HOT70-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; HOT70-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; HOT70: 3:
+; HOT70-NEXT: tail call void @llvm.ubsantrap(i8 22)
; HOT70-NEXT: unreachable
; HOT70: 4:
; HOT70-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; HOT70-NEXT: ret i32 [[TMP5]]
;
- %2 = icmp eq ptr %0, null
+ %chk = icmp eq ptr %0, null
+ %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+ %hot = xor i1 %allow, true
+ %2 = or i1 %chk, %hot
br i1 %2, label %3, label %4
3:
@@ -182,7 +222,9 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
; NOPROFILE-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
; NOPROFILE: 4:
; NOPROFILE-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; NOPROFILE-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; NOPROFILE-NEXT: [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
+; NOPROFILE-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; NOPROFILE: 6:
; NOPROFILE-NEXT: tail call void @llvm.ubsantrap(i8 22)
; NOPROFILE-NEXT: unreachable
@@ -199,8 +241,11 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
; ALL-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
; ALL: 4:
; ALL-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; ALL-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; ALL-NEXT: [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
+; ALL-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; ALL: 6:
+; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
; ALL-NEXT: unreachable
; ALL: 7:
; ALL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
@@ -209,22 +254,24 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
; ALL-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
; ALL-NEXT: ret i32 [[TMP10]]
;
-; HOT-LABEL: define dso_local noundef i32 @branchColdFnHot(
-; HOT-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
-; HOT-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
-; HOT-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
-; HOT: 4:
-; HOT-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; HOT-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; HOT: 6:
-; HOT-NEXT: tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT: unreachable
-; HOT: 7:
-; HOT-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
-; HOT-NEXT: br label [[TMP9]]
-; HOT: 9:
-; HOT-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
-; HOT-NEXT: ret i32 [[TMP10]]
+; HOT99-LABEL: define dso_local noundef i32 @branchColdFnHot(
+; HOT99-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
+; HOT99-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; HOT99-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
+; HOT99: 4:
+; HOT99-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
+; HOT99-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT99-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
+; HOT99-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; HOT99: 6:
+; HOT99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT: unreachable
+; HOT99: 7:
+; HOT99-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; HOT99-NEXT: br label [[TMP9]]
+; HOT99: 9:
+; HOT99-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; HOT99-NEXT: ret i32 [[TMP10]]
;
; HOT70-LABEL: define dso_local noundef i32 @branchColdFnHot(
; HOT70-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
@@ -232,7 +279,9 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
; HOT70-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
; HOT70: 4:
; HOT70-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; HOT70-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; HOT70-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT70-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
+; HOT70-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; HOT70: 6:
; HOT70-NEXT: tail call void @llvm.ubsantrap(i8 22)
; HOT70-NEXT: unreachable
@@ -247,7 +296,10 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
br i1 %3, label %9, label %4, !prof !38
4:
- %5 = icmp eq ptr %1, null
+ %chk = icmp eq ptr %1, null
+ %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+ %hot = xor i1 %allow, true
+ %5 = or i1 %chk, %hot
br i1 %5, label %6, label %7
6:
@@ -270,7 +322,9 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; NOPROFILE-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
; NOPROFILE: 4:
; NOPROFILE-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; NOPROFILE-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; NOPROFILE-NEXT: [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
+; NOPROFILE-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; NOPROFILE: 6:
; NOPROFILE-NEXT: tail call void @llvm.ubsantrap(i8 22)
; NOPROFILE-NEXT: unreachable
@@ -287,8 +341,11 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; ALL-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
; ALL: 4:
; ALL-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; ALL-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; ALL-NEXT: [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
+; ALL-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; ALL: 6:
+; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
; ALL-NEXT: unreachable
; ALL: 7:
; ALL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
@@ -297,21 +354,24 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; ALL-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
; ALL-NEXT: ret i32 [[TMP10]]
;
-; HOT-LABEL: define dso_local noundef i32 @branchHotFnCold(
-; HOT-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
-; HOT-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
-; HOT-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
-; HOT: 4:
-; HOT-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; HOT-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; HOT: 6:
-; HOT-NEXT: unreachable
-; HOT: 7:
-; HOT-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
-; HOT-NEXT: br label [[TMP9]]
-; HOT: 9:
-; HOT-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
-; HOT-NEXT: ret i32 [[TMP10]]
+; HOT99-LABEL: define dso_local noundef i32 @branchHotFnCold(
+; HOT99-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
+; HOT99-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; HOT99-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
+; HOT99: 4:
+; HOT99-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
+; HOT99-NEXT: [[HOT:%.*]] = xor i1 false, true
+; HOT99-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
+; HOT99-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; HOT99: 6:
+; HOT99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT: unreachable
+; HOT99: 7:
+; HOT99-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; HOT99-NEXT: br label [[TMP9]]
+; HOT99: 9:
+; HOT99-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; HOT99-NEXT: ret i32 [[TMP10]]
;
; HOT70-LABEL: define dso_local noundef i32 @branchHotFnCold(
; HOT70-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
@@ -319,7 +379,9 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; HOT70-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
; HOT70: 4:
; HOT70-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; HOT70-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; HOT70-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT70-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
+; HOT70-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; HOT70: 6:
; HOT70-NEXT: tail call void @llvm.ubsantrap(i8 22)
; HOT70-NEXT: unreachable
@@ -334,7 +396,10 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
br i1 %3, label %9, label %4, !prof !37
4:
- %5 = icmp eq ptr %1, null
+ %chk = icmp eq ptr %1, null
+ %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+ %hot = xor i1 %allow, true
+ %5 = or i1 %chk, %hot
br i1 %5, label %6, label %7
6:
@@ -385,10 +450,10 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; ALL: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
; ALL: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
;.
-; HOT: [[PROF16]] = !{!"function_entry_count", i64 1000}
-; HOT: [[PROF17]] = !{!"function_entry_count", i64 7000}
-; HOT: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
-; HOT: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
+; HOT99: [[PROF16]] = !{!"function_entry_count", i64 1000}
+; HOT99: [[PROF17]] = !{!"function_entry_count", i64 7000}
+; HOT99: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
+; HOT99: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
;.
; HOT70: [[PROF16]] = !{!"function_entry_count", i64 1000}
; HOT70: [[PROF17]] = !{!"function_entry_count", i64 7000}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll
new file mode 100644
index 000000000000..84f7e219f506
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+@e = global i8 0
+@c = global i16 0
+@d = global i32 0
+
+define i8 @test() {
+; CHECK-LABEL: define i8 @test() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @e, align 1
+; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @c, align 2
+; CHECK-NEXT: [[CONV1:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[CONV]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 32769>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[CONV1]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[TMP4]], [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP8]])
+; CHECK-NEXT: [[CONV4_30:%.*]] = trunc i32 [[TMP11]] to i8
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7
+; CHECK-NEXT: [[XOR_31:%.*]] = and i32 [[TMP13]], -2
+; CHECK-NEXT: store i32 [[XOR_31]], ptr @d, align 4
+; CHECK-NEXT: ret i8 [[CONV4_30]]
+;
+entry:
+ %0 = load i8, ptr @e, align 1
+ %conv = sext i8 %0 to i32
+ %1 = load i16, ptr @c, align 2
+ %conv1 = zext i16 %1 to i32
+ %or.16 = or i32 %conv, 1
+ %add.16 = add nsw i32 %or.16, %conv1
+ %or.18 = or i32 %conv, 1
+ %add.18 = add nsw i32 %or.18, %conv1
+ %conv4.181 = or i32 %add.16, %add.18
+ %or.20 = or i32 %conv, 1
+ %add.20 = add nsw i32 %or.20, %conv1
+ %conv4.202 = or i32 %conv4.181, %add.20
+ %or.22 = or i32 %conv, 1
+ %add.22 = add nsw i32 %or.22, %conv1
+ %conv4.223 = or i32 %conv4.202, %add.22
+ %or.24 = or i32 %conv, 1
+ %add.24 = add nsw i32 %or.24, %conv1
+ %conv4.244 = or i32 %conv4.223, %add.24
+ %or.26 = or i32 %conv, 1
+ %add.26 = add nsw i32 %or.26, %conv1
+ %conv4.265 = or i32 %conv4.244, %add.26
+ %or.28 = or i32 %conv, 1
+ %add.28 = add nsw i32 %or.28, %conv1
+ %conv4.286 = or i32 %conv4.265, %add.28
+ %or.30 = or i32 %conv, 32769
+ %add.30 = add nsw i32 %or.30, %conv1
+ %conv4.307 = or i32 %conv4.286, %add.30
+ %conv4.30 = trunc i32 %conv4.307 to i8
+ %xor.31 = and i32 %or.30, -2
+ store i32 %xor.31, ptr @d, align 4
+ ret i8 %conv4.30
+}
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
index f804300b1022..2031c2d04c60 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
@@ -6,9 +6,8 @@
define <16 x i32> @concat_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: @concat_zext_v8i16_v16i32(
-; CHECK-NEXT: [[X0:%.*]] = zext <8 x i16> [[A0:%.*]] to <8 x i32>
-; CHECK-NEXT: [[X1:%.*]] = zext <8 x i16> [[A1:%.*]] to <8 x i32>
-; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[R:%.*]] = zext <16 x i16> [[TMP1]] to <16 x i32>
; CHECK-NEXT: ret <16 x i32> [[R]]
;
%x0 = zext <8 x i16> %a0 to <8 x i32>
@@ -19,9 +18,8 @@ define <16 x i32> @concat_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
define <16 x i32> @concat_zext_nneg_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: @concat_zext_nneg_v8i16_v16i32(
-; CHECK-NEXT: [[X0:%.*]] = zext nneg <8 x i16> [[A0:%.*]] to <8 x i32>
-; CHECK-NEXT: [[X1:%.*]] = zext nneg <8 x i16> [[A1:%.*]] to <8 x i32>
-; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[R:%.*]] = zext nneg <16 x i16> [[TMP1]] to <16 x i32>
; CHECK-NEXT: ret <16 x i32> [[R]]
;
%x0 = zext nneg <8 x i16> %a0 to <8 x i32>
@@ -30,13 +28,17 @@ define <16 x i32> @concat_zext_nneg_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
ret <16 x i32> %r
}
-; TODO - sext + zext nneg -> sext
define <16 x i32> @concat_sext_zext_nneg_v8i16_v8i32(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: @concat_sext_zext_nneg_v8i16_v8i32(
-; CHECK-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32>
-; CHECK-NEXT: [[X1:%.*]] = zext nneg <8 x i16> [[A1:%.*]] to <8 x i32>
-; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: ret <16 x i32> [[R]]
+; SSE-LABEL: @concat_sext_zext_nneg_v8i16_v8i32(
+; SSE-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32>
+; SSE-NEXT: [[X1:%.*]] = zext nneg <8 x i16> [[A1:%.*]] to <8 x i32>
+; SSE-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT: ret <16 x i32> [[R]]
+;
+; AVX-LABEL: @concat_sext_zext_nneg_v8i16_v8i32(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT: [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32>
+; AVX-NEXT: ret <16 x i32> [[R]]
;
%x0 = sext <8 x i16> %a0 to <8 x i32>
%x1 = zext nneg <8 x i16> %a1 to <8 x i32>
@@ -46,9 +48,8 @@ define <16 x i32> @concat_sext_zext_nneg_v8i16_v8i32(<8 x i16> %a0, <8 x i16> %a
define <16 x i32> @concat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: @concat_sext_v8i16_v16i32(
-; CHECK-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32>
-; CHECK-NEXT: [[X1:%.*]] = sext <8 x i16> [[A1:%.*]] to <8 x i32>
-; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32>
; CHECK-NEXT: ret <16 x i32> [[R]]
;
%x0 = sext <8 x i16> %a0 to <8 x i32>
@@ -59,9 +60,8 @@ define <16 x i32> @concat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
define <8 x i32> @concat_sext_v4i1_v8i32(<4 x i1> %a0, <4 x i1> %a1) {
; CHECK-LABEL: @concat_sext_v4i1_v8i32(
-; CHECK-NEXT: [[X0:%.*]] = sext <4 x i1> [[A0:%.*]] to <4 x i32>
-; CHECK-NEXT: [[X1:%.*]] = sext <4 x i1> [[A1:%.*]] to <4 x i32>
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[A0:%.*]], <4 x i1> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[R:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i32>
; CHECK-NEXT: ret <8 x i32> [[R]]
;
%x0 = sext <4 x i1> %a0 to <4 x i32>
@@ -72,9 +72,8 @@ define <8 x i32> @concat_sext_v4i1_v8i32(<4 x i1> %a0, <4 x i1> %a1) {
define <8 x i16> @concat_trunc_v4i32_v8i16(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @concat_trunc_v4i32_v8i16(
-; CHECK-NEXT: [[X0:%.*]] = trunc <4 x i32> [[A0:%.*]] to <4 x i16>
-; CHECK-NEXT: [[X1:%.*]] = trunc <4 x i32> [[A1:%.*]] to <4 x i16>
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[X0]], <4 x i16> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[R:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%x0 = trunc <4 x i32> %a0 to <4 x i16>
@@ -85,9 +84,8 @@ define <8 x i16> @concat_trunc_v4i32_v8i16(<4 x i32> %a0, <4 x i32> %a1) {
define <8 x ptr> @concat_inttoptr_v4i32_v8iptr(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @concat_inttoptr_v4i32_v8iptr(
-; CHECK-NEXT: [[X0:%.*]] = inttoptr <4 x i32> [[A0:%.*]] to <4 x ptr>
-; CHECK-NEXT: [[X1:%.*]] = inttoptr <4 x i32> [[A1:%.*]] to <4 x ptr>
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x ptr> [[X0]], <4 x ptr> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[R:%.*]] = inttoptr <8 x i32> [[TMP1]] to <8 x ptr>
; CHECK-NEXT: ret <8 x ptr> [[R]]
;
%x0 = inttoptr <4 x i32> %a0 to <4 x ptr>
@@ -98,9 +96,8 @@ define <8 x ptr> @concat_inttoptr_v4i32_v8iptr(<4 x i32> %a0, <4 x i32> %a1) {
define <16 x i64> @concat_ptrtoint_v8i16_v16i32(<8 x ptr> %a0, <8 x ptr> %a1) {
; CHECK-LABEL: @concat_ptrtoint_v8i16_v16i32(
-; CHECK-NEXT: [[X0:%.*]] = ptrtoint <8 x ptr> [[A0:%.*]] to <8 x i64>
-; CHECK-NEXT: [[X1:%.*]] = ptrtoint <8 x ptr> [[A1:%.*]] to <8 x i64>
-; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i64> [[X0]], <8 x i64> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[A0:%.*]], <8 x ptr> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[R:%.*]] = ptrtoint <16 x ptr> [[TMP1]] to <16 x i64>
; CHECK-NEXT: ret <16 x i64> [[R]]
;
%x0 = ptrtoint <8 x ptr> %a0 to <8 x i64>
@@ -110,11 +107,16 @@ define <16 x i64> @concat_ptrtoint_v8i16_v16i32(<8 x ptr> %a0, <8 x ptr> %a1) {
}
define <8 x double> @concat_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: @concat_fpext_v4f32_v8f64(
-; CHECK-NEXT: [[X0:%.*]] = fpext <4 x float> [[A0:%.*]] to <4 x double>
-; CHECK-NEXT: [[X1:%.*]] = fpext <4 x float> [[A1:%.*]] to <4 x double>
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x double> [[X0]], <4 x double> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret <8 x double> [[R]]
+; SSE-LABEL: @concat_fpext_v4f32_v8f64(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[R:%.*]] = fpext <8 x float> [[TMP1]] to <8 x double>
+; SSE-NEXT: ret <8 x double> [[R]]
+;
+; AVX-LABEL: @concat_fpext_v4f32_v8f64(
+; AVX-NEXT: [[X0:%.*]] = fpext <4 x float> [[A0:%.*]] to <4 x double>
+; AVX-NEXT: [[X1:%.*]] = fpext <4 x float> [[A1:%.*]] to <4 x double>
+; AVX-NEXT: [[R:%.*]] = shufflevector <4 x double> [[X0]], <4 x double> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT: ret <8 x double> [[R]]
;
%x0 = fpext <4 x float> %a0 to <4 x double>
%x1 = fpext <4 x float> %a1 to <4 x double>
@@ -139,9 +141,8 @@ define <16 x float> @concat_fptrunc_v8f64_v16f32(<8 x double> %a0, <8 x double>
define <16 x i32> @rconcat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: @rconcat_sext_v8i16_v16i32(
-; CHECK-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32>
-; CHECK-NEXT: [[X1:%.*]] = sext <8 x i16> [[A1:%.*]] to <8 x i32>
-; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32>
; CHECK-NEXT: ret <16 x i32> [[R]]
;
%x0 = sext <8 x i16> %a0 to <8 x i32>
@@ -154,9 +155,8 @@ define <16 x i32> @rconcat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
define <8 x double> @interleave_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: @interleave_fpext_v4f32_v8f64(
-; CHECK-NEXT: [[X0:%.*]] = fpext <4 x float> [[A0:%.*]] to <4 x double>
-; CHECK-NEXT: [[X1:%.*]] = fpext <4 x float> [[A1:%.*]] to <4 x double>
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x double> [[X0]], <4 x double> [[X1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: [[R:%.*]] = fpext <8 x float> [[TMP1]] to <8 x double>
; CHECK-NEXT: ret <8 x double> [[R]]
;
%x0 = fpext <4 x float> %a0 to <4 x double>
@@ -226,6 +226,3 @@ define <16 x i32> @concat_sext_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
%r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i32> %r
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
diff --git a/llvm/test/Verifier/module-flags-note-gnu-property-elf-pauthabi.ll b/llvm/test/Verifier/module-flags-note-gnu-property-elf-pauthabi.ll
new file mode 100644
index 000000000000..435073d01c8e
--- /dev/null
+++ b/llvm/test/Verifier/module-flags-note-gnu-property-elf-pauthabi.ll
@@ -0,0 +1,19 @@
+; RUN: rm -rf %t && split-file %s %t && cd %t
+
+; CHECK: either both or no 'aarch64-elf-pauthabi-platform' and 'aarch64-elf-pauthabi-version' module flags must be present
+
+;--- err1.ll
+
+; RUN: not llvm-as err1.ll -o /dev/null 2>&1 | FileCheck %s
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 2}
+
+;--- err2.ll
+
+; RUN: not llvm-as err2.ll -o /dev/null 2>&1 | FileCheck %s
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 31}
diff --git a/llvm/test/Verifier/pr69428.ll b/llvm/test/Verifier/pr69428.ll
new file mode 100644
index 000000000000..be8733bea1ab
--- /dev/null
+++ b/llvm/test/Verifier/pr69428.ll
@@ -0,0 +1,48 @@
+; RUN: llvm-as -disable-output %s
+
+%struct._List_node_emplace_op2 = type { i8 }
+
+@"?_List@@3HA" = global i32 0, align 4
+
+define void @"?ExecutionEngineaddExecutableDependency@@YAXXZ"() personality ptr @__CxxFrameHandler3 {
+entry:
+ %agg.tmp.ensured.i = alloca %struct._List_node_emplace_op2, align 1
+ %0 = load i32, ptr @"?_List@@3HA", align 4
+ %call.i = call noundef ptr @"??0?$_List_node_emplace_op2@H@@QEAA@H@Z"(ptr %agg.tmp.ensured.i, i32 %0)
+ invoke void @llvm.seh.scope.begin()
+ to label %invoke.cont.i unwind label %ehcleanup.i
+
+invoke.cont.i: ; preds = %entry
+ invoke void @llvm.seh.scope.end()
+ to label %invoke.cont2.i unwind label %ehcleanup.i
+
+invoke.cont2.i: ; preds = %invoke.cont.i
+ call void @"??1?$_List_node_emplace_op2@H@@QEAA@XZ"(ptr %agg.tmp.ensured.i) #6
+ unreachable
+
+ehcleanup.i: ; preds = %invoke.cont.i, %entry
+ %1 = cleanuppad within none []
+ invoke void @llvm.seh.scope.begin()
+ to label %invoke.cont.i.i unwind label %ehcleanup.i.i
+
+invoke.cont.i.i: ; preds = %ehcleanup.i
+ invoke void @llvm.seh.scope.end()
+ to label %"??1?$_List_node_emplace_op2@H@@QEAA@XZ.exit.i" unwind label %ehcleanup.i.i
+
+ehcleanup.i.i: ; preds = %invoke.cont.i.i, %ehcleanup.i
+ %2 = cleanuppad within %1 []
+ call void @"??1_Alloc_construct_ptr@@QEAA@XZ"(ptr %agg.tmp.ensured.i) #6 [ "funclet"(token %2) ]
+ cleanupret from %2 unwind to caller
+
+"??1?$_List_node_emplace_op2@H@@QEAA@XZ.exit.i": ; preds = %invoke.cont.i.i
+ call void @"??1_Alloc_construct_ptr@@QEAA@XZ"(ptr %agg.tmp.ensured.i) #6 [ "funclet"(token %1) ]
+ cleanupret from %1 unwind to caller
+}
+
+declare i32 @__CxxFrameHandler3(...)
+declare void @llvm.seh.scope.begin()
+declare void @llvm.seh.scope.end()
+
+declare void @"??1?$_List_node_emplace_op2@H@@QEAA@XZ"(ptr)
+declare void @"??1_Alloc_construct_ptr@@QEAA@XZ"(ptr)
+declare ptr @"??0?$_List_node_emplace_op2@H@@QEAA@H@Z"(ptr, i32)
diff --git a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s
index ca1faf62aa89..1b196b4355a6 100644
--- a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s
@@ -1632,17 +1632,17 @@ vzeroupper
# CHECK-NEXT: 4 17 2.00 * vrcpps (%rax), %ymm2
# CHECK-NEXT: 1 5 1.00 vrcpss %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 2 10 1.00 * vrcpss (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundpd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * vroundpd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundpd $1, %ymm0, %ymm2
+# CHECK-NEXT: 2 6 2.00 vroundpd $1, %ymm0, %ymm2
# CHECK-NEXT: 3 12 2.00 * vroundpd $1, (%rax), %ymm2
-# CHECK-NEXT: 1 6 0.50 vroundps $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundps $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * vroundps $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundps $1, %ymm0, %ymm2
+# CHECK-NEXT: 2 6 2.00 vroundps $1, %ymm0, %ymm2
# CHECK-NEXT: 3 12 2.00 * vroundps $1, (%rax), %ymm2
-# CHECK-NEXT: 1 6 0.50 vroundsd $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundsd $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 3 11 2.00 * vroundsd $1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundss $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundss $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 3 11 2.00 * vroundss $1, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 5 1.00 vrsqrtps %xmm0, %xmm2
# CHECK-NEXT: 2 10 1.00 * vrsqrtps (%rax), %xmm2
@@ -1736,7 +1736,7 @@ vzeroupper
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - 257.00 216.25 235.25 176.17 176.17 38.00 424.25 3.25 12.67
+# CHECK-NEXT: - 257.00 216.25 247.25 173.17 173.17 38.00 424.25 3.25 12.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -2342,17 +2342,17 @@ vzeroupper
# CHECK-NEXT: - - 2.33 0.33 0.50 0.50 - 0.33 - - vrcpps (%rax), %ymm2
# CHECK-NEXT: - - 1.00 - - - - - - - vrcpss %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vrcpss (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundpd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundpd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundpd $1, %ymm0, %ymm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundpd $1, %ymm0, %ymm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundpd $1, (%rax), %ymm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundps $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundps $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundps $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundps $1, %ymm0, %ymm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundps $1, %ymm0, %ymm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundps $1, (%rax), %ymm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundsd $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundsd $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundsd $1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundss $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundss $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundss $1, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 - - - - - - - vrsqrtps %xmm0, %xmm2
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vrsqrtps (%rax), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s
index dcc535306c85..4865121308bb 100644
--- a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s
@@ -243,13 +243,13 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 3 15 2.00 * pmulld (%rax), %xmm2
# CHECK-NEXT: 2 2 1.00 ptest %xmm0, %xmm1
# CHECK-NEXT: 3 7 1.00 * ptest (%rax), %xmm1
-# CHECK-NEXT: 1 6 0.50 roundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundpd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * roundpd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundps $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundps $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * roundps $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundsd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundsd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * roundsd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundss $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundss $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * roundss $1, (%rax), %xmm2
# CHECK: Resources:
@@ -266,7 +266,7 @@ roundss $1, (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 23.83 22.33 25.67 25.67 5.00 80.33 0.50 1.67
+# CHECK-NEXT: - - 23.83 30.33 23.67 23.67 5.00 80.33 0.50 1.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -358,11 +358,11 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - 2.00 - 0.50 0.50 - - - - pmulld (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - ptest %xmm0, %xmm1
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - 1.00 - - ptest (%rax), %xmm1
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundpd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundpd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundps $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundps $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundps $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundsd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundsd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundsd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundss $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundss $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundss $1, (%rax), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s
index cff60c9ce3ab..05c476079c0f 100644
--- a/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s
@@ -1632,17 +1632,17 @@ vzeroupper
# CHECK-NEXT: 4 18 2.00 * vrcpps (%rax), %ymm2
# CHECK-NEXT: 1 5 1.00 vrcpss %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 2 10 1.00 * vrcpss (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundpd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * vroundpd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundpd $1, %ymm0, %ymm2
+# CHECK-NEXT: 2 6 2.00 vroundpd $1, %ymm0, %ymm2
# CHECK-NEXT: 3 13 2.00 * vroundpd $1, (%rax), %ymm2
-# CHECK-NEXT: 1 6 0.50 vroundps $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundps $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * vroundps $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundps $1, %ymm0, %ymm2
+# CHECK-NEXT: 2 6 2.00 vroundps $1, %ymm0, %ymm2
# CHECK-NEXT: 3 13 2.00 * vroundps $1, (%rax), %ymm2
-# CHECK-NEXT: 1 6 0.50 vroundsd $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundsd $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 3 12 2.00 * vroundsd $1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundss $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundss $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 3 12 2.00 * vroundss $1, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 5 1.00 vrsqrtps %xmm0, %xmm2
# CHECK-NEXT: 2 11 1.00 * vrsqrtps (%rax), %xmm2
@@ -1736,7 +1736,7 @@ vzeroupper
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - 336.00 215.58 236.58 176.17 176.17 38.00 427.58 3.25 12.67
+# CHECK-NEXT: - 336.00 215.58 248.58 173.17 173.17 38.00 427.58 3.25 12.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -2342,17 +2342,17 @@ vzeroupper
# CHECK-NEXT: - - 2.33 0.33 0.50 0.50 - 0.33 - - vrcpps (%rax), %ymm2
# CHECK-NEXT: - - 1.00 - - - - - - - vrcpss %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vrcpss (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundpd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundpd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundpd $1, %ymm0, %ymm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundpd $1, %ymm0, %ymm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundpd $1, (%rax), %ymm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundps $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundps $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundps $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundps $1, %ymm0, %ymm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundps $1, %ymm0, %ymm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundps $1, (%rax), %ymm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundsd $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundsd $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundsd $1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundss $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundss $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundss $1, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 - - - - - - - vrsqrtps %xmm0, %xmm2
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vrsqrtps (%rax), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s
index c2d07735f1cb..62dfa23a6bad 100644
--- a/llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s
@@ -243,13 +243,13 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 3 16 2.00 * pmulld (%rax), %xmm2
# CHECK-NEXT: 2 2 1.00 ptest %xmm0, %xmm1
# CHECK-NEXT: 3 8 1.00 * ptest (%rax), %xmm1
-# CHECK-NEXT: 1 6 0.50 roundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundpd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * roundpd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundps $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundps $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * roundps $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundsd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundsd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * roundsd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundss $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundss $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * roundss $1, (%rax), %xmm2
# CHECK: Resources:
@@ -266,7 +266,7 @@ roundss $1, (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 23.83 22.33 25.67 25.67 5.00 80.33 0.50 1.67
+# CHECK-NEXT: - - 23.83 30.33 23.67 23.67 5.00 80.33 0.50 1.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -358,11 +358,11 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - 2.00 - 0.50 0.50 - - - - pmulld (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - ptest %xmm0, %xmm1
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - 1.00 - - ptest (%rax), %xmm1
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundpd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundpd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundps $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundps $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundps $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundsd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundsd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundsd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundss $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundss $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundss $1, (%rax), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s b/llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s
index 4654ce10dffd..349abec66457 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s
@@ -68,12 +68,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -145,12 +145,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -223,12 +223,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -306,12 +306,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -389,12 +389,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -472,12 +472,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s
index 12d6f399d429..0fcd6f507432 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s
@@ -46,12 +46,12 @@ add %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -122,12 +122,12 @@ add %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s
index 93f8d7616cb9..cd427bb5912e 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s
@@ -41,12 +41,12 @@ mulxq %rax, %rax, %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -112,12 +112,12 @@ mulxq %rax, %rax, %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s
index 13ef5bcb11ca..bf82486cf737 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s
@@ -43,12 +43,12 @@ mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -115,12 +115,12 @@ mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s
index bfe8be85086f..8a5a0148cf58 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s
@@ -44,12 +44,12 @@ mulxq %rax, %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -116,12 +116,12 @@ mulxq %rax, %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s
index 1431875ade93..f0e16a8d8b99 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s
@@ -68,12 +68,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -159,12 +159,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -250,12 +250,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -341,12 +341,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s
index eb2bb979b7f5..97f6a34acac1 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s
@@ -68,12 +68,12 @@ vpaddq %ymm0, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -159,12 +159,12 @@ vpaddq %ymm0, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -250,12 +250,12 @@ vpaddq %ymm0, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -341,12 +341,12 @@ vpaddq %ymm0, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s
index 5909af853dd5..c733f639819b 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s
@@ -63,12 +63,12 @@ paddd %mm0, %mm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -154,12 +154,12 @@ paddd %mm0, %mm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -245,12 +245,12 @@ paddd %mm0, %mm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s
index 5a05487f217a..63df99e56f8c 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s
@@ -68,12 +68,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -159,12 +159,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -250,12 +250,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -341,12 +341,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s
index 7ac674c5a6b5..66c1322fa790 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s
@@ -40,12 +40,12 @@ xor %bx, %dx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s
index 582da14211d0..4ed529ee1446 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s
@@ -40,12 +40,12 @@ add %cx, %bx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s
index dda87e9ebc92..58941116d31f 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s
@@ -33,12 +33,12 @@ lzcnt %ax, %bx ## partial register stall.
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s
index 71520ea1ce4b..fdbf4d99fbf6 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s
@@ -42,12 +42,12 @@ lzcnt 2(%rsp), %cx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s
index 7afa80cb0c9a..f3e515ce7e21 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s
@@ -180,12 +180,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -474,12 +474,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -768,12 +768,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1062,12 +1062,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1356,12 +1356,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1650,12 +1650,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s
index 8b81d55ca5db..a484a7562fdc 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s
@@ -180,12 +180,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -474,12 +474,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -768,12 +768,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1062,12 +1062,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1356,12 +1356,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1650,12 +1650,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s
index f359048e5298..eb20d13e8c94 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s
@@ -134,12 +134,12 @@ xchgq %r15, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -402,12 +402,12 @@ xchgq %r15, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -670,12 +670,12 @@ xchgq %r15, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -938,12 +938,12 @@ xchgq %r15, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s
index b556fd61f077..e17d6717e136 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s
@@ -61,12 +61,12 @@ movq %mm7, %mm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s
index 147cb0fca285..b45fd172bd55 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s
@@ -180,12 +180,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -474,12 +474,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -768,12 +768,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1062,12 +1062,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1356,12 +1356,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1650,12 +1650,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s
index de59edf2352e..0465d413a854 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s
@@ -67,12 +67,12 @@ fxch %st(0)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s
index 4e024e5846f1..9c5a19b1a0a2 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s
@@ -38,12 +38,12 @@ adox (%rbx), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s
index 5abf3cca9211..d1086961682d 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s
@@ -50,12 +50,12 @@ aeskeygenassist $22, (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s
index 146b3ce62964..4f0b4843d170 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s
@@ -1731,12 +1731,12 @@ vzeroupper
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s
index 3c6b31a1ca01..1a8b9e2de1d8 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s
@@ -771,12 +771,12 @@ vpxor (%rax), %ymm1, %ymm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s
index 8c0e84135750..26002373e9f5 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s
@@ -85,12 +85,12 @@ tzcnt (%rax), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s
index 8d00c99982b0..0664c1dc7eed 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s
@@ -100,12 +100,12 @@ shrx %rax, (%rbx), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s
index 3e7219c9c6d0..b40d155e13f5 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s
@@ -23,12 +23,12 @@ clflushopt (%rax)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s
index 0dc89faad77c..0f9935c5ba14 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s
@@ -23,12 +23,12 @@ clzero
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s
index e0e46afd0c0b..8118e40ca5be 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s
@@ -218,12 +218,12 @@ cmovgq (%rax), %rdi
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s
index 03763e5a2bfe..9ab877636b29 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s
@@ -25,12 +25,12 @@ cmpxchg16b (%rax)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s
index bb995d588c43..345ae02b3002 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s
@@ -40,12 +40,12 @@ vcvtps2ph $0, %ymm0, (%rax)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s
index 9af180d8cb5c..af207f029c82 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s
@@ -500,12 +500,12 @@ vfnmsub231ss (%rax), %xmm1, %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s
index 142508c4477e..3e651837c2bd 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s
@@ -40,12 +40,12 @@ wrgsbase %rdi
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s
index 1545a228d9c5..02572026003c 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s
@@ -293,12 +293,12 @@ lea 1024(%rax, %rbx, 2), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s
index ffbe414bac6c..735287a6f564 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s
@@ -35,12 +35,12 @@ lzcntq (%rax), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s
index 75dbf95f4caa..2bc6177c8031 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s
@@ -279,12 +279,12 @@ pxor (%rax), %mm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s
index 144e97fbaf58..6eeabbd2d7d4 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s
@@ -35,12 +35,12 @@ movbe (%rax), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s
index 3b343d733425..103fd3ebf239 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s
@@ -25,12 +25,12 @@ mwaitx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s
index 2d9f0e9b14e2..893f47612f2f 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s
@@ -25,12 +25,12 @@ pclmulqdq $11, (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s
index cce078f2a8d8..29bcc5c62b4c 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s
@@ -35,12 +35,12 @@ popcntq (%rax), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s
index 5423b6bcff1f..b80e8f7f8b02 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s
@@ -25,12 +25,12 @@ prefetchw (%rax)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s
index fb0925364449..649eb10c22af 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s
@@ -27,12 +27,12 @@ rdrand %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s
index f10a90ff9bc4..44e0eebe4a2a 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s
@@ -27,12 +27,12 @@ rdseed %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s
index 360a667edfaa..e6d5ab90a2ac 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s
@@ -55,12 +55,12 @@ sha256rnds2 (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s
index 9816b87df462..4c7a3f06d762 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s
@@ -328,12 +328,12 @@ xorps (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s
index f69c535385af..d24aebfdf675 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s
@@ -684,12 +684,12 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s
index 8110390219c7..51bb95feb091 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s
@@ -74,12 +74,12 @@ mwait
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s
index 0cc6c6a5c25e..e952a169db8a 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s
@@ -261,12 +261,12 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s
index 873e4f406426..8afcd809b67c 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s
@@ -70,12 +70,12 @@ pcmpgtq (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s
index 1c1b0b24222f..6606a3ef2b2b 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s
@@ -35,12 +35,12 @@ movntss %xmm0, (%rax)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s
index aeec49351a11..66688700404e 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s
@@ -180,12 +180,12 @@ psignw (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s
index 076094ffe195..81afc7df85ef 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s
@@ -40,12 +40,12 @@ vaesenclast (%rax), %ymm1, %ymm3
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s
index 31680d562a42..10440e904b01 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s
@@ -25,12 +25,12 @@ vpclmulqdq $11, (%rax), %ymm1, %ymm3
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s
index fb09b650840e..8f627ca18771 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s
@@ -56,12 +56,12 @@ salc
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s
index fedb3d242c45..41ec631dc3fb 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s
@@ -1957,12 +1957,12 @@ xorq (%rax), %rdi
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s
index 9a92bd0f2e7d..cd8a06a57f78 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s
@@ -364,12 +364,12 @@ fyl2xp1
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s
index 819361c0a025..f348ff8696f6 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s
@@ -35,12 +35,12 @@ xsetbv
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s
index 33657e6b66b8..ed4e8f90dd7b 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s
@@ -138,12 +138,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -229,12 +229,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -320,12 +320,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -411,12 +411,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -502,12 +502,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -593,12 +593,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -684,12 +684,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -775,12 +775,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -866,12 +866,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -957,12 +957,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1048,12 +1048,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1139,12 +1139,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1230,12 +1230,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1321,12 +1321,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1412,12 +1412,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1503,12 +1503,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1594,12 +1594,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1685,12 +1685,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s
index ba7f51eb245a..24043369c6f1 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s
@@ -148,12 +148,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -239,12 +239,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -330,12 +330,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -421,12 +421,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -512,12 +512,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -603,12 +603,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -694,12 +694,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -785,12 +785,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -876,12 +876,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -967,12 +967,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1058,12 +1058,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1149,12 +1149,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1240,12 +1240,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1331,12 +1331,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1422,12 +1422,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1513,12 +1513,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1604,12 +1604,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1695,12 +1695,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1786,12 +1786,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1878,12 +1878,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s
index 018adc261b08..4d648f7fb9de 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s
@@ -68,12 +68,12 @@ addq %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -175,12 +175,12 @@ addq %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -282,12 +282,12 @@ addq %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -389,12 +389,12 @@ addq %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s
index 935881a91421..aca39c52f36e 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s
@@ -138,12 +138,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -229,12 +229,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -320,12 +320,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -411,12 +411,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -502,12 +502,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -593,12 +593,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -684,12 +684,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -775,12 +775,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -866,12 +866,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -957,12 +957,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1048,12 +1048,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1139,12 +1139,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1230,12 +1230,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1321,12 +1321,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1412,12 +1412,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1503,12 +1503,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1594,12 +1594,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1685,12 +1685,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-objcopy/ELF/compress-sections-within-segment.s b/llvm/test/tools/llvm-objcopy/ELF/compress-sections-within-segment.s
new file mode 100644
index 000000000000..064ffcadc12e
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/compress-sections-within-segment.s
@@ -0,0 +1,38 @@
+## Disallow (de)compression for sections within a segment as they are
+## effectively immutable.
+# RUN: rm -rf %t && mkdir %t && cd %t
+# RUN: yaml2obj %s -o a
+# RUN: not llvm-objcopy a /dev/null --compress-sections .text=zlib 2>&1 | FileCheck %s --implicit-check-not=error:
+
+# CHECK: error: 'a': section '.text' within a segment cannot be (de)compressed
+
+# RUN: not llvm-objcopy a /dev/null --compress-sections foo=none 2>&1 | FileCheck %s --check-prefix=CHECK2 --implicit-check-not=error:
+
+# CHECK2: error: 'a': section 'foo' within a segment cannot be (de)compressed
+
+## There is an error even if 'foo' is already compressed with zlib.
+# RUN: not llvm-objcopy a /dev/null --compress-sections foo=zlib 2>&1 | FileCheck %s --check-prefix=CHECK3 --implicit-check-not=error:
+
+# CHECK3: error: 'a': section 'foo' within a segment cannot be (de)compressed
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+ProgramHeaders:
+ - Type: PT_LOAD
+ FirstSec: .text
+ LastSec: foo
+ Align: 0x1000
+ Offset: 0x1000
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Offset: 0x1000
+ Content: C3
+ - Name: foo
+ Type: SHT_PROGBITS
+ Flags: [ SHF_COMPRESSED ]
+ Content: 010000000000000040000000000000000100000000000000789cd36280002d3269002f800151
diff --git a/llvm/test/tools/llvm-objcopy/ELF/compress-sections.s b/llvm/test/tools/llvm-objcopy/ELF/compress-sections.s
new file mode 100644
index 000000000000..e6fa86068a1a
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/compress-sections.s
@@ -0,0 +1,128 @@
+# REQUIRES: x86-registered-target, zlib, zstd
+
+# RUN: rm -rf %t && mkdir %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o a.o
+## '*0=none' wins because it is the last. '*0' sections are decompressed (if originally compressed) or kept unchanged (if uncompressed).
+## No section is named 'nomatch'. The third option is a no-op.
+# RUN: llvm-objcopy a.o out --compress-sections='*0=zlib' --compress-sections '*0=none' --compress-sections 'nomatch=none' 2>&1 | count 0
+# RUN: llvm-readelf -S out | FileCheck %s --check-prefix=CHECK1
+
+# CHECK1: Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK1: .text PROGBITS [[#%x,TEXT:]] [[#%x,]] [[#%x,]] 00 AX 0 0 4
+# CHECK1: foo0 PROGBITS [[#%x,FOO0:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
+# CHECK1-NEXT: .relafoo0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 3 8
+# CHECK1-NEXT: foo1 PROGBITS [[#%x,FOO1:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
+# CHECK1-NEXT: .relafoo1 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 5 8
+# CHECK1: nonalloc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
+# CHECK1-NEXT: .relanonalloc0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 7 8
+# CHECK1-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
+# CHECK1-NEXT: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MS 0 0 1
+
+## Mixing zlib and zstd.
+# RUN: llvm-objcopy a.o out2 --compress-sections '*c0=zlib' --compress-sections .debug_str=zstd
+# RUN: llvm-readelf -Sr -x nonalloc0 -x .debug_str out2 2>&1 | FileCheck %s --check-prefix=CHECK2
+# RUN: llvm-readelf -z -x nonalloc0 -x .debug_str out2 | FileCheck %s --check-prefix=CHECK2DE
+
+# CHECK2: Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK2: .text PROGBITS [[#%x,TEXT:]] [[#%x,]] [[#%x,]] 00 AX 0 0 4
+# CHECK2: foo0 PROGBITS [[#%x,FOO0:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
+# CHECK2-NEXT: .relafoo0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 3 8
+# CHECK2-NEXT: foo1 PROGBITS [[#%x,FOO1:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
+# CHECK2-NEXT: .relafoo1 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 5 8
+# CHECK2: nonalloc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 8
+# CHECK2-NEXT: .relanonalloc0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 IC 11 7 8
+# CHECK2-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
+# CHECK2-NEXT: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MSC 0 0 8
+
+## llvm-readelf -r doesn't support SHF_COMPRESSED SHT_RELA.
+# CHECK2: warning: {{.*}}: unable to read relocations from SHT_RELA section with index 8: section [index 8] has an invalid sh_size ([[#]]) which is not a multiple of its sh_entsize (24)
+
+# CHECK2: Hex dump of section 'nonalloc0':
+## zlib with ch_size=0x10
+# CHECK2-NEXT: 01000000 00000000 10000000 00000000
+# CHECK2-NEXT: 08000000 00000000 {{.*}}
+# CHECK2: Hex dump of section '.debug_str':
+## zstd with ch_size=0x38
+# CHECK2-NEXT: 02000000 00000000 38000000 00000000
+# CHECK2-NEXT: 01000000 00000000 {{.*}}
+
+# CHECK2DE: Hex dump of section 'nonalloc0':
+# CHECK2DE-NEXT: 0x00000000 00000000 00000000 00000000 00000000 ................
+# CHECK2DE-EMPTY:
+# CHECK2DE-NEXT: Hex dump of section '.debug_str':
+# CHECK2DE-NEXT: 0x00000000 41414141 41414141 41414141 41414141 AAAAAAAAAAAAAAAA
+
+## --decompress-debug-sections takes precedence, even if it is before --compress-sections.
+# RUN: llvm-objcopy a.o out3 --decompress-debug-sections --compress-sections .debug_str=zstd
+# RUN: llvm-readelf -S out3 | FileCheck %s --check-prefix=CHECK3
+
+# CHECK3: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MS 0 0 1
+
+# RUN: llvm-objcopy a.o out4 --compress-sections '*0=zlib'
+# RUN: llvm-readelf -S out4 | FileCheck %s --check-prefix=CHECK4
+
+# CHECK4: Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK4: .text PROGBITS [[#%x,TEXT:]] [[#%x,]] [[#%x,]] 00 AX 0 0 4
+# CHECK4: foo0 PROGBITS [[#%x,FOO0:]] [[#%x,]] [[#%x,]] 00 AC 0 0 8
+# CHECK4-NEXT: .relafoo0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 IC 11 3 8
+# CHECK4-NEXT: foo1 PROGBITS [[#%x,FOO1:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
+# CHECK4-NEXT: .relafoo1 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 5 8
+# CHECK4: nonalloc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 8
+# CHECK4-NEXT: .relanonalloc0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 IC 11 7 8
+# CHECK4-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
+# CHECK4-NEXT: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MS 0 0 1
+
+## If a section is already compressed, compression request for another format is ignored.
+# RUN: llvm-objcopy a.o out5 --compress-sections 'nonalloc0=zlib'
+# RUN: llvm-readelf -x nonalloc0 out5 | FileCheck %s --check-prefix=CHECK5
+# RUN: llvm-objcopy out5 out5a --compress-sections 'nonalloc0=zstd'
+# RUN: cmp out5 out5a
+
+# CHECK5: Hex dump of section 'nonalloc0':
+## zlib with ch_size=0x10
+# CHECK5-NEXT: 01000000 00000000 10000000 00000000
+# CHECK5-NEXT: 08000000 00000000 {{.*}}
+
+# RUN: not llvm-objcopy --compress-sections=foo a.o out 2>&1 | \
+# RUN: FileCheck %s --check-prefix=ERR1 --implicit-check-not=error:
+# ERR1: error: --compress-sections: parse error, not 'section-glob=[none|zlib|zstd]'
+
+# RUN: llvm-objcopy --compress-sections 'a[=zlib' a.o out 2>&1 | \
+# RUN: FileCheck %s --check-prefix=ERR2 --implicit-check-not=error:
+# ERR2: warning: invalid glob pattern, unmatched '['
+
+# RUN: not llvm-objcopy a.o out --compress-sections='.debug*=zlib-gabi' --compress-sections='.debug*=' 2>&1 | \
+# RUN: FileCheck -check-prefix=ERR3 %s
+# ERR3: error: invalid or unsupported --compress-sections format: .debug*=zlib-gabi
+
+# RUN: not llvm-objcopy a.o out --compress-sections='!.debug*=zlib' 2>&1 | \
+# RUN: FileCheck -check-prefix=ERR4 %s
+# ERR4: error: --compress-sections: negative pattern is unsupported
+
+.globl _start
+_start:
+ ret
+
+.section foo0,"a"
+.balign 8
+.quad .text-.
+.quad .text-.
+.section foo1,"a"
+.balign 8
+.quad .text-.
+.quad .text-.
+.section nonalloc0,""
+.balign 8
+.quad .text+1
+.quad .text+2
+sym0:
+.section nonalloc1,""
+.balign 8
+.quad 42
+sym1:
+
+.section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "AAAAAAAAAAAAAAAAAAAAAAAAAAA"
+.Linfo_string1:
+ .asciz "BBBBBBBBBBBBBBBBBBBBBBBBBBB"
diff --git a/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test b/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test
index 4258ddbe66a3..d9f4f3809c4d 100644
--- a/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test
+++ b/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test
@@ -4,6 +4,8 @@
# RUN: yaml2obj %s -o %t
# RUN: llvm-objcopy --decompress-debug-sections %t %t.de
# RUN: llvm-readelf -S %t.de | FileCheck %s
+# RUN: llvm-objcopy --compress-sections '*nonalloc=none' --compress-sections .debugx=none %t %t.1.de
+# RUN: cmp %t.de %t.1.de
# CHECK: Name Type Address Off Size ES Flg Lk Inf Al
# CHECK: .debug_alloc PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 AC 0 0 0
@@ -11,6 +13,33 @@
# CHECK-NEXT: .debugx PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 1
# CHECK-NEXT: nodebug PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 0
+# RUN: llvm-objcopy --compress-sections '.debug*=none' %t %t2.de
+# RUN: llvm-readelf -S -x .debug_alloc -x .debug_nonalloc -x .debugx %t2.de | FileCheck %s --check-prefix=CHECK2
+
+# CHECK2: Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK2: .debug_alloc PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 A 0 0 1
+# CHECK2-NEXT: .debug_nonalloc PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 1
+# CHECK2-NEXT: .debugx PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 1
+# CHECK2-NEXT: nodebug PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 0
+
+# CHECK2: Hex dump of section '.debug_alloc':
+# CHECK2-NEXT: 0x00000000 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000010 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000020 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000030 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-EMPTY:
+# CHECK2: Hex dump of section '.debug_nonalloc':
+# CHECK2-NEXT: 0x00000000 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000010 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000020 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000030 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-EMPTY:
+# CHECK2-NEXT: Hex dump of section '.debugx':
+# CHECK2-NEXT: 0x00000000 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000010 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000020 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000030 2a000000 00000000 2a000000 00000000 *.......*.......
+
--- !ELF
FileHeader:
Class: ELFCLASS64
diff --git a/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s b/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s
index f28d92eae857..512531748cd2 100644
--- a/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s
+++ b/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s
@@ -1,98 +1,204 @@
# RUN: rm -rf %t && split-file %s %t && cd %t
-# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag.s -o tag.o
-# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-short.s -o tag-short.o
-# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-long.s -o tag-long.o
-
-# RUN: llvm-readelf --notes tag.o | FileCheck --check-prefix NORMAL %s
-# RUN: llvm-readelf --notes tag-short.o | FileCheck --check-prefix SHORT %s
-# RUN: llvm-readelf --notes tag-long.o | FileCheck --check-prefix LONG %s
-
-# NORMAL: AArch64 PAuth ABI tag: platform 0x2a, version 0x1
-# SHORT: AArch64 PAuth ABI tag: <corrupted size: expected at least 16, got 12>
-# LONG: AArch64 PAuth ABI tag: platform 0x2a, version 0x1, additional info 0xEFCDAB8967452301
-
-# RUN: llvm-readobj --notes tag.o | FileCheck --check-prefix LLVM-NORMAL %s
-# RUN: llvm-readobj --notes tag-short.o | FileCheck --check-prefix LLVM-SHORT %s
-# RUN: llvm-readobj --notes tag-long.o | FileCheck --check-prefix LLVM-LONG %s
-
-# LLVM-SHORT: Notes [
-# LLVM-SHORT-NEXT: NoteSection {
-# LLVM-SHORT-NEXT: Name: .note.AARCH64-PAUTH-ABI-tag
-# LLVM-SHORT-NEXT: Offset: 0x40
-# LLVM-SHORT-NEXT: Size: 0x1C
-# LLVM-SHORT-NEXT: Note {
-# LLVM-SHORT-NEXT: Owner: ARM
-# LLVM-SHORT-NEXT: Data size: 0xC
-# LLVM-SHORT-NEXT: Type: NT_ARM_TYPE_PAUTH_ABI_TAG
-# LLVM-SHORT-NEXT: Description data (
-# LLVM-SHORT-NEXT: 0000: 2A000000 00000000 01000000
-# LLVM-SHORT-NEXT: )
-# LLVM-SHORT-NEXT: }
-# LLVM-SHORT-NEXT: }
-# LLVM-SHORT-NEXT: ]
-
-# LLVM-NORMAL: Notes [
-# LLVM-NORMAL-NEXT: NoteSection {
-# LLVM-NORMAL-NEXT: Name: .note.AARCH64-PAUTH-ABI-tag
-# LLVM-NORMAL-NEXT: Offset: 0x40
-# LLVM-NORMAL-NEXT: Size: 0x20
-# LLVM-NORMAL-NEXT: Note {
-# LLVM-NORMAL-NEXT: Owner: ARM
-# LLVM-NORMAL-NEXT: Data size: 0x10
-# LLVM-NORMAL-NEXT: Type: NT_ARM_TYPE_PAUTH_ABI_TAG
-# LLVM-NORMAL-NEXT: Platform: 42
-# LLVM-NORMAL-NEXT: Version: 1
-# LLVM-NORMAL-NEXT: }
-# LLVM-NORMAL-NEXT: }
-# LLVM-NORMAL-NEXT: ]
-
-# LLVM-LONG: Notes [
-# LLVM-LONG-NEXT: NoteSection {
-# LLVM-LONG-NEXT: Name: .note.AARCH64-PAUTH-ABI-tag
-# LLVM-LONG-NEXT: Offset: 0x40
-# LLVM-LONG-NEXT: Size: 0x28
-# LLVM-LONG-NEXT: Note {
-# LLVM-LONG-NEXT: Owner: ARM
-# LLVM-LONG-NEXT: Data size: 0x18
-# LLVM-LONG-NEXT: Type: NT_ARM_TYPE_PAUTH_ABI_TAG
-# LLVM-LONG-NEXT: Platform: 42
-# LLVM-LONG-NEXT: Version: 1
-# LLVM-LONG-NEXT: Additional info: EFCDAB8967452301
-# LLVM-LONG-NEXT: }
-# LLVM-LONG-NEXT: }
-# LLVM-LONG-NEXT: ]
-
-#--- abi-tag.s
-
-.section ".note.AARCH64-PAUTH-ABI-tag", "a"
-.long 4
-.long 16
-.long 1
-.asciz "ARM"
-
-.quad 42 // platform
-.quad 1 // version
-
-#--- abi-tag-short.s
-
-.section ".note.AARCH64-PAUTH-ABI-tag", "a"
-.long 4
-.long 12
-.long 1
-.asciz "ARM"
-
-.quad 42
-.word 1
-
-#--- abi-tag-long.s
-
-.section ".note.AARCH64-PAUTH-ABI-tag", "a"
-.long 4
-.long 24
-.long 1
-.asciz "ARM"
-
-.quad 42 // platform
-.quad 1 // version
-.quad 0x0123456789ABCDEF // extra data
+#--- gnu-42-1.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 16 // Data size
+ .quad 42 // PAuth ABI platform
+ .quad 1 // PAuth ABI version
+ .p2align 3 // Align to 8 byte for 64 bit
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-42-1.s -o gnu-42-1.o
+# RUN: llvm-readelf --notes gnu-42-1.o | \
+# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x2a (unknown)" -DVERSION=0x1 %s
+# RUN: llvm-readobj --notes gnu-42-1.o | \
+# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x2a (unknown)" -DVERSION=0x1 %s
+
+# ELF: Displaying notes found in: .note.gnu.property
+# ELF-NEXT: Owner Data size Description
+# ELF-NEXT: GNU 0x00000018 NT_GNU_PROPERTY_TYPE_0 (property note)
+# ELF-NEXT: AArch64 PAuth ABI core info: platform [[PLATFORM]], version [[VERSION]]
+
+# OBJ: Notes [
+# OBJ-NEXT: NoteSection {
+# OBJ-NEXT: Name: .note.gnu.property
+# OBJ-NEXT: Offset: 0x40
+# OBJ-NEXT: Size: 0x28
+# OBJ-NEXT: Note {
+# OBJ-NEXT: Owner: GNU
+# OBJ-NEXT: Data size: 0x18
+# OBJ-NEXT: Type: NT_GNU_PROPERTY_TYPE_0 (property note)
+# OBJ-NEXT: Property [
+# OBJ-NEXT: AArch64 PAuth ABI core info: platform [[PLATFORM]], version [[VERSION]]
+# OBJ-NEXT: ]
+# OBJ-NEXT: }
+# OBJ-NEXT: }
+# OBJ-NEXT: ]
+
+#--- gnu-0-0.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 16 // Data size
+ .quad 0 // PAuth ABI platform
+ .quad 0 // PAuth ABI version
+ .p2align 3 // Align to 8 byte for 64 bit
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-0-0.s -o gnu-0-0.o
+# RUN: llvm-readelf --notes gnu-0-0.o | \
+# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x0 (invalid)" -DVERSION=0x0 %s
+# RUN: llvm-readobj --notes gnu-0-0.o | \
+# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x0 (invalid)" -DVERSION=0x0 %s
+
+#--- gnu-1-0.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 16 // Data size
+ .quad 1 // PAuth ABI platform
+ .quad 0 // PAuth ABI version
+ .p2align 3 // Align to 8 byte for 64 bit
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-1-0.s -o gnu-1-0.o
+# RUN: llvm-readelf --notes gnu-1-0.o | \
+# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x1 (baremetal)" -DVERSION=0x0 %s
+# RUN: llvm-readobj --notes gnu-1-0.o | \
+# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x1 (baremetal)" -DVERSION=0x0 %s
+
+#--- gnu-0x10000002-85.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 16 // Data size
+ .quad 0x10000002 // PAuth ABI platform
+ .quad 85 // PAuth ABI version
+ .p2align 3 // Align to 8 byte for 64 bit
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-0x10000002-85.s -o gnu-0x10000002-85.o
+# RUN: llvm-readelf --notes gnu-0x10000002-85.o | \
+# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x10000002 (llvm_linux)" \
+# RUN: -DVERSION="0x55 (PointerAuthIntrinsics, !PointerAuthCalls, PointerAuthReturns, !PointerAuthAuthTraps, PointerAuthVTPtrAddressDiscrimination, !PointerAuthVTPtrTypeDiscrimination, PointerAuthInitFini)" %s
+# RUN: llvm-readobj --notes gnu-0x10000002-85.o | \
+# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x10000002 (llvm_linux)" \
+# RUN: -DVERSION="0x55 (PointerAuthIntrinsics, !PointerAuthCalls, PointerAuthReturns, !PointerAuthAuthTraps, PointerAuthVTPtrAddressDiscrimination, !PointerAuthVTPtrTypeDiscrimination, PointerAuthInitFini)" %s
+
+#--- gnu-0x10000002-128.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 16 // Data size
+ .quad 0x10000002 // PAuth ABI platform
+ .quad 128 // PAuth ABI version
+ .p2align 3 // Align to 8 byte for 64 bit
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-0x10000002-128.s -o gnu-0x10000002-128.o
+# RUN: llvm-readelf --notes gnu-0x10000002-128.o | \
+# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x10000002 (llvm_linux)" -DVERSION="0x80 (unknown)" %s
+# RUN: llvm-readobj --notes gnu-0x10000002-128.o | \
+# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x10000002 (llvm_linux)" -DVERSION="0x80 (unknown)" %s
+
+#--- gnu-short.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 12 // Data size
+ .quad 42 // PAuth ABI platform
+ .word 1 // PAuth ABI version
+ .p2align 3 // Align to 8 byte for 64 bit
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-short.s -o gnu-short.o
+# RUN: llvm-readelf --notes gnu-short.o | \
+# RUN: FileCheck --check-prefix=ELF-ERR -DSIZE=28 -DDATASIZE=18 \
+# RUN: -DERR="<corrupted size: expected 16, got 12>" %s
+# RUN: llvm-readobj --notes gnu-short.o | \
+# RUN: FileCheck --check-prefix=OBJ-ERR -DSIZE=28 -DDATASIZE=18 \
+# RUN: -DERR="<corrupted size: expected 16, got 12>" %s
+
+# ELF-ERR: Displaying notes found in: .note.gnu.property
+# ELF-ERR-NEXT: Owner Data size Description
+# ELF-ERR-NEXT: GNU 0x000000[[DATASIZE]] NT_GNU_PROPERTY_TYPE_0 (property note)
+# ELF-ERR-NEXT: AArch64 PAuth ABI core info: [[ERR]]
+
+# OBJ-ERR: Notes [
+# OBJ-ERR-NEXT: NoteSection {
+# OBJ-ERR-NEXT: Name: .note.gnu.property
+# OBJ-ERR-NEXT: Offset: 0x40
+# OBJ-ERR-NEXT: Size: 0x[[SIZE]]
+# OBJ-ERR-NEXT: Note {
+# OBJ-ERR-NEXT: Owner: GNU
+# OBJ-ERR-NEXT: Data size: 0x[[DATASIZE]]
+# OBJ-ERR-NEXT: Type: NT_GNU_PROPERTY_TYPE_0 (property note)
+# OBJ-ERR-NEXT: Property [
+# OBJ-ERR-NEXT: AArch64 PAuth ABI core info: [[ERR]]
+# OBJ-ERR-NEXT: ]
+# OBJ-ERR-NEXT: }
+# OBJ-ERR-NEXT: }
+# OBJ-ERR-NEXT: ]
+
+#--- gnu-long.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 24 // Data size
+ .quad 42 // PAuth ABI platform
+ .quad 1 // PAuth ABI version
+ .quad 0x0123456789ABCDEF
+ .p2align 3 // Align to 8 byte for 64 bit
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-long.s -o gnu-long.o
+# RUN: llvm-readelf --notes gnu-long.o | \
+# RUN: FileCheck --check-prefix=ELF-ERR -DSIZE=30 -DDATASIZE=20 \
+# RUN: -DERR="<corrupted size: expected 16, got 24>" %s
+# RUN: llvm-readobj --notes gnu-long.o | \
+# RUN: FileCheck --check-prefix=OBJ-ERR -DSIZE=30 -DDATASIZE=20 \
+# RUN: -DERR="<corrupted size: expected 16, got 24>" %s
diff --git a/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s b/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s
index 377e6f93448c..b517f0b38155 100644
--- a/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s
+++ b/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s
@@ -1,3 +1,5 @@
+// See tests for GNU_PROPERTY_AARCH64_FEATURE_PAUTH in aarch64-feature-pauth.s
+
// RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu %s -o %t
// RUN: llvm-readelf --notes %t | FileCheck %s --check-prefix=GNU
// RUN: llvm-readobj --notes %t | FileCheck %s --check-prefix=LLVM
diff --git a/llvm/tools/gold/CMakeLists.txt b/llvm/tools/gold/CMakeLists.txt
index 58b323805c7d..5c78529e38f4 100644
--- a/llvm/tools/gold/CMakeLists.txt
+++ b/llvm/tools/gold/CMakeLists.txt
@@ -12,7 +12,7 @@ if( LLVM_ENABLE_PIC AND LLVM_BINUTILS_INCDIR )
TargetParser
)
- add_llvm_library(LLVMgold MODULE
+ add_llvm_library(LLVMgold MODULE INSTALL_WITH_TOOLCHAIN
gold-plugin.cpp
)
diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp
index 49154dc46c57..6ad1c99568e4 100644
--- a/llvm/tools/llvm-dis/llvm-dis.cpp
+++ b/llvm/tools/llvm-dis/llvm-dis.cpp
@@ -258,12 +258,8 @@ int main(int argc, char **argv) {
// All that llvm-dis does is write the assembly to a file.
if (!DontPrint) {
if (M) {
- bool ChangeDbgFormat = M->IsNewDbgInfoFormat != WriteNewDbgInfoFormat;
- if (ChangeDbgFormat)
- M->setIsNewDbgInfoFormat(WriteNewDbgInfoFormat);
+ ScopedDbgInfoFormatSetter FormatSetter(*M, WriteNewDbgInfoFormat);
M->print(Out->os(), Annotator.get(), PreserveAssemblyUseListOrder);
- if (ChangeDbgFormat)
- M->setIsNewDbgInfoFormat(!WriteNewDbgInfoFormat);
}
if (Index)
Index->print(Out->os());
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index 7269c51a08d6..70e85460d3df 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -736,6 +736,42 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
return createStringError(errc::invalid_argument, Reason);
}
+ for (const auto *A : InputArgs.filtered(OBJCOPY_compress_sections)) {
+ SmallVector<StringRef, 0> Fields;
+ StringRef(A->getValue()).split(Fields, '=');
+ if (Fields.size() != 2 || Fields[1].empty()) {
+ return createStringError(
+ errc::invalid_argument,
+ A->getSpelling() +
+ ": parse error, not 'section-glob=[none|zlib|zstd]'");
+ }
+
+ auto Type = StringSwitch<DebugCompressionType>(Fields[1])
+ .Case("zlib", DebugCompressionType::Zlib)
+ .Case("zstd", DebugCompressionType::Zstd)
+ .Default(DebugCompressionType::None);
+ if (Type == DebugCompressionType::None && Fields[1] != "none") {
+ return createStringError(
+ errc::invalid_argument,
+ "invalid or unsupported --compress-sections format: %s",
+ A->getValue());
+ }
+
+ auto &P = Config.compressSections.emplace_back();
+ P.second = Type;
+ auto Matcher =
+ NameOrPattern::create(Fields[0], SectionMatchStyle, ErrorCallback);
+ // =none allows overriding a previous =zlib or =zstd. Reject negative
+ // patterns, which would be confusing.
+ if (Matcher && !Matcher->isPositiveMatch()) {
+ return createStringError(
+ errc::invalid_argument,
+ "--compress-sections: negative pattern is unsupported");
+ }
+ if (Error E = P.first.addMatcher(std::move(Matcher)))
+ return std::move(E);
+ }
+
Config.AddGnuDebugLink = InputArgs.getLastArgValue(OBJCOPY_add_gnu_debuglink);
// The gnu_debuglink's target is expected to not change or else its CRC would
// become invalidated and get rejected. We can avoid recalculating the
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOpts.td b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
index be02616e8c68..4bc80eba05f8 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
@@ -35,6 +35,12 @@ def : Flag<["--"], "compress-debug-sections">, Alias<compress_debug_sections>,
AliasArgs<["zlib"]>;
def decompress_debug_sections : Flag<["--"], "decompress-debug-sections">,
HelpText<"Decompress DWARF debug sections">;
+defm compress_sections
+ : Eq<"compress-sections",
+ "Compress or decompress sections using specified format. Supported "
+ "formats: zlib, zstd. Specify 'none' for decompression">,
+ MetaVarName<"<section-glob>=<format>">;
+
defm split_dwo
: Eq<"split-dwo", "Equivalent to --extract-dwo and <dwo-file> as the output file and no other options, "
"and then --strip-dwo on the input file">,
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 4b406ef12aec..d3534820c117 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -61,6 +61,7 @@
#include "llvm/Support/SystemZ/zOSSupport.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <array>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
@@ -5105,6 +5106,73 @@ template <class ELFT> void GNUELFDumper<ELFT>::printAddrsig() {
}
}
+template <class ELFT>
+static bool printAArch64PAuthABICoreInfo(raw_ostream &OS, uint32_t DataSize,
+ ArrayRef<uint8_t> Desc) {
+ OS << " AArch64 PAuth ABI core info: ";
+ // DataSize - size without padding, Desc.size() - size with padding
+ if (DataSize != 16) {
+ OS << format("<corrupted size: expected 16, got %d>", DataSize);
+ return false;
+ }
+
+ uint64_t Platform =
+ support::endian::read64<ELFT::Endianness>(Desc.data() + 0);
+ uint64_t Version = support::endian::read64<ELFT::Endianness>(Desc.data() + 8);
+
+ const char *PlatformDesc = [Platform]() {
+ switch (Platform) {
+ case AARCH64_PAUTH_PLATFORM_INVALID:
+ return "invalid";
+ case AARCH64_PAUTH_PLATFORM_BAREMETAL:
+ return "baremetal";
+ case AARCH64_PAUTH_PLATFORM_LLVM_LINUX:
+ return "llvm_linux";
+ default:
+ return "unknown";
+ }
+ }();
+
+ std::string VersionDesc = [Platform, Version]() -> std::string {
+ if (Platform != AARCH64_PAUTH_PLATFORM_LLVM_LINUX)
+ return "";
+ if (Version >= (1 << (AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_LAST + 1)))
+ return "unknown";
+
+ std::array<StringRef, AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_LAST + 1>
+ Flags;
+ Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INTRINSICS] = "Intrinsics";
+ Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_CALLS] = "Calls";
+ Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_RETURNS] = "Returns";
+ Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_AUTHTRAPS] = "AuthTraps";
+ Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_VPTRADDRDISCR] =
+ "VTPtrAddressDiscrimination";
+ Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_VPTRTYPEDISCR] =
+ "VTPtrTypeDiscrimination";
+ Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INITFINI] = "InitFini";
+
+ static_assert(AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INITFINI ==
+ AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_LAST,
+ "Update when new enum items are defined");
+
+ std::string Desc;
+ for (uint32_t I = 0, End = Flags.size(); I < End; ++I) {
+ if (!(Version & (1ULL << I)))
+ Desc += '!';
+ Desc +=
+ Twine("PointerAuth" + Flags[I] + (I == End - 1 ? "" : ", ")).str();
+ }
+ return Desc;
+ }();
+
+ OS << format("platform 0x%" PRIx64 " (%s), version 0x%" PRIx64, Platform,
+ PlatformDesc, Version);
+ if (!VersionDesc.empty())
+ OS << format(" (%s)", VersionDesc.c_str());
+
+ return true;
+}
+
template <typename ELFT>
static std::string getGNUProperty(uint32_t Type, uint32_t DataSize,
ArrayRef<uint8_t> Data) {
@@ -5162,6 +5230,9 @@ static std::string getGNUProperty(uint32_t Type, uint32_t DataSize,
if (PrData)
OS << format("<unknown flags: 0x%x>", PrData);
return OS.str();
+ case GNU_PROPERTY_AARCH64_FEATURE_PAUTH:
+ printAArch64PAuthABICoreInfo<ELFT>(OS, DataSize, Data);
+ return OS.str();
case GNU_PROPERTY_X86_FEATURE_2_NEEDED:
case GNU_PROPERTY_X86_FEATURE_2_USED:
OS << "x86 feature "
@@ -5364,29 +5435,6 @@ static bool printAndroidNote(raw_ostream &OS, uint32_t NoteType,
}
template <class ELFT>
-static bool printAArch64Note(raw_ostream &OS, uint32_t NoteType,
- ArrayRef<uint8_t> Desc) {
- if (NoteType != NT_ARM_TYPE_PAUTH_ABI_TAG)
- return false;
-
- OS << " AArch64 PAuth ABI tag: ";
- if (Desc.size() < 16) {
- OS << format("<corrupted size: expected at least 16, got %d>", Desc.size());
- return false;
- }
-
- uint64_t Platform = endian::read64<ELFT::Endianness>(Desc.data() + 0);
- uint64_t Version = endian::read64<ELFT::Endianness>(Desc.data() + 8);
- OS << format("platform 0x%" PRIx64 ", version 0x%" PRIx64, Platform, Version);
-
- if (Desc.size() > 16)
- OS << ", additional info 0x"
- << toHex(ArrayRef<uint8_t>(Desc.data() + 16, Desc.size() - 16));
-
- return true;
-}
-
-template <class ELFT>
void GNUELFDumper<ELFT>::printMemtag(
const ArrayRef<std::pair<std::string, std::string>> DynamicEntries,
const ArrayRef<uint8_t> AndroidNoteDesc,
@@ -5783,10 +5831,6 @@ const NoteType AndroidNoteTypes[] = {
"NT_ANDROID_TYPE_MEMTAG (Android memory tagging information)"},
};
-const NoteType ARMNoteTypes[] = {
- {ELF::NT_ARM_TYPE_PAUTH_ABI_TAG, "NT_ARM_TYPE_PAUTH_ABI_TAG"},
-};
-
const NoteType CoreNoteTypes[] = {
{ELF::NT_PRSTATUS, "NT_PRSTATUS (prstatus structure)"},
{ELF::NT_FPREGSET, "NT_FPREGSET (floating point registers)"},
@@ -5905,8 +5949,6 @@ StringRef getNoteTypeName(const typename ELFT::Note &Note, unsigned ELFType) {
return FindNote(LLVMOMPOFFLOADNoteTypes);
if (Name == "Android")
return FindNote(AndroidNoteTypes);
- if (Name == "ARM")
- return FindNote(ARMNoteTypes);
if (ELFType == ELF::ET_CORE)
return FindNote(CoreNoteTypes);
@@ -6062,9 +6104,6 @@ template <class ELFT> void GNUELFDumper<ELFT>::printNotes() {
} else if (Name == "Android") {
if (printAndroidNote(OS, Type, Descriptor))
return Error::success();
- } else if (Name == "ARM") {
- if (printAArch64Note<ELFT>(OS, Type, Descriptor))
- return Error::success();
}
if (!Descriptor.empty()) {
OS << " description data:";
@@ -7703,27 +7742,6 @@ static bool printAndroidNoteLLVMStyle(uint32_t NoteType, ArrayRef<uint8_t> Desc,
}
template <class ELFT>
-static bool printAarch64NoteLLVMStyle(uint32_t NoteType, ArrayRef<uint8_t> Desc,
- ScopedPrinter &W) {
- if (NoteType != NT_ARM_TYPE_PAUTH_ABI_TAG)
- return false;
-
- if (Desc.size() < 16)
- return false;
-
- uint64_t platform = endian::read64<ELFT::Endianness>(Desc.data() + 0);
- uint64_t version = endian::read64<ELFT::Endianness>(Desc.data() + 8);
- W.printNumber("Platform", platform);
- W.printNumber("Version", version);
-
- if (Desc.size() > 16)
- W.printString("Additional info",
- toHex(ArrayRef<uint8_t>(Desc.data() + 16, Desc.size() - 16)));
-
- return true;
-}
-
-template <class ELFT>
void LLVMELFDumper<ELFT>::printMemtag(
const ArrayRef<std::pair<std::string, std::string>> DynamicEntries,
const ArrayRef<uint8_t> AndroidNoteDesc,
@@ -7859,9 +7877,6 @@ template <class ELFT> void LLVMELFDumper<ELFT>::printNotes() {
} else if (Name == "Android") {
if (printAndroidNoteLLVMStyle(Type, Descriptor, W))
return Error::success();
- } else if (Name == "ARM") {
- if (printAarch64NoteLLVMStyle<ELFT>(Type, Descriptor, W))
- return Error::success();
}
if (!Descriptor.empty()) {
W.printBinaryBlock("Description data", Descriptor);
diff --git a/llvm/unittests/ADT/APIntTest.cpp b/llvm/unittests/ADT/APIntTest.cpp
index d5ef63e38e27..76fc26412407 100644
--- a/llvm/unittests/ADT/APIntTest.cpp
+++ b/llvm/unittests/ADT/APIntTest.cpp
@@ -3249,21 +3249,11 @@ TEST(APIntTest, SolveQuadraticEquationWrap) {
}
TEST(APIntTest, MultiplicativeInverseExaustive) {
- for (unsigned BitWidth = 1; BitWidth <= 16; ++BitWidth) {
- for (unsigned Value = 0; Value < (1u << BitWidth); ++Value) {
+ for (unsigned BitWidth = 1; BitWidth <= 8; ++BitWidth) {
+ for (unsigned Value = 1; Value < (1u << BitWidth); Value += 2) {
+ // Multiplicative inverse exists for all odd numbers.
APInt V = APInt(BitWidth, Value);
- APInt MulInv =
- V.zext(BitWidth + 1)
- .multiplicativeInverse(APInt::getSignedMinValue(BitWidth + 1))
- .trunc(BitWidth);
- APInt One = V * MulInv;
- if (!V.isZero() && V.countr_zero() == 0) {
- // Multiplicative inverse exists for all odd numbers.
- EXPECT_TRUE(One.isOne());
- } else {
- // Multiplicative inverse does not exist for even numbers (and 0).
- EXPECT_TRUE(MulInv.isZero());
- }
+ EXPECT_EQ(V * V.multiplicativeInverse(), 1);
}
}
}
diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index 1cca44e9b037..f1aa6f37aa39 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -265,7 +265,9 @@ TEST(MemProf, PortableWrapper) {
EXPECT_EQ(3UL, ReadBlock.getAllocCpuId());
}
-TEST(MemProf, RecordSerializationRoundTrip) {
+// Version0 and Version1 serialize IndexedMemProfRecord in the same format, so
+// we share one test.
+TEST(MemProf, RecordSerializationRoundTripVersion0And1) {
const MemProfSchema Schema = getFullSchema();
MemInfoBlock Info(/*size=*/16, /*access_count=*/7, /*alloc_timestamp=*/1000,
@@ -284,14 +286,47 @@ TEST(MemProf, RecordSerializationRoundTrip) {
Info);
}
Record.CallSites.assign(CallSites);
+ for (const auto &CS : CallSites)
+ Record.CallSiteIds.push_back(llvm::memprof::hashCallStack(CS));
std::string Buffer;
llvm::raw_string_ostream OS(Buffer);
- Record.serialize(Schema, OS);
+ Record.serialize(Schema, OS, llvm::memprof::Version0);
OS.flush();
const IndexedMemProfRecord GotRecord = IndexedMemProfRecord::deserialize(
- Schema, reinterpret_cast<const unsigned char *>(Buffer.data()));
+ Schema, reinterpret_cast<const unsigned char *>(Buffer.data()),
+ llvm::memprof::Version0);
+
+ EXPECT_EQ(Record, GotRecord);
+}
+
+TEST(MemProf, RecordSerializationRoundTripVerion2) {
+ const MemProfSchema Schema = getFullSchema();
+
+ MemInfoBlock Info(/*size=*/16, /*access_count=*/7, /*alloc_timestamp=*/1000,
+ /*dealloc_timestamp=*/2000, /*alloc_cpu=*/3,
+ /*dealloc_cpu=*/4);
+
+ llvm::SmallVector<llvm::memprof::CallStackId> CallStackIds = {0x123, 0x456};
+
+ llvm::SmallVector<llvm::memprof::CallStackId> CallSiteIds = {0x333, 0x444};
+
+ IndexedMemProfRecord Record;
+ for (const auto &CSId : CallStackIds) {
+ // Use the same info block for both allocation sites.
+ Record.AllocSites.emplace_back(llvm::SmallVector<FrameId>(), CSId, Info);
+ }
+ Record.CallSiteIds.assign(CallSiteIds);
+
+ std::string Buffer;
+ llvm::raw_string_ostream OS(Buffer);
+ Record.serialize(Schema, OS, llvm::memprof::Version2);
+ OS.flush();
+
+ const IndexedMemProfRecord GotRecord = IndexedMemProfRecord::deserialize(
+ Schema, reinterpret_cast<const unsigned char *>(Buffer.data()),
+ llvm::memprof::Version2);
EXPECT_EQ(Record, GotRecord);
}
diff --git a/llvm/unittests/TextAPI/TextStubV5Tests.cpp b/llvm/unittests/TextAPI/TextStubV5Tests.cpp
index c77d13ef8f23..62fdd79ae497 100644
--- a/llvm/unittests/TextAPI/TextStubV5Tests.cpp
+++ b/llvm/unittests/TextAPI/TextStubV5Tests.cpp
@@ -722,7 +722,7 @@ TEST(TBDv5, WriteFile) {
File.setInstallName("@rpath/S/L/F/Foo.framework/Foo");
File.setCurrentVersion(PackedVersion(1, 2, 0));
File.setCompatibilityVersion(PackedVersion(1, 1, 0));
- File.addRPath(AllTargets[0], "@executable_path/.../Frameworks");
+ File.addRPath("@executable_path/.../Frameworks", AllTargets[0]);
for (const auto &Targ : AllTargets) {
File.addParentUmbrella(Targ, "System");
@@ -897,7 +897,7 @@ TEST(TBDv5, WriteMultipleDocuments) {
NestedFile.setTwoLevelNamespace();
NestedFile.setApplicationExtensionSafe(false);
NestedFile.setCurrentVersion(PackedVersion(2, 1, 1));
- NestedFile.addRPath(AllTargets[0], "@executable_path/.../Frameworks");
+ NestedFile.addRPath("@executable_path/.../Frameworks", AllTargets[0]);
for (const auto &Targ : AllTargets)
NestedFile.addReexportedLibrary("@rpath/libfoo.dylib", Targ);
NestedFile.addSymbol(EncodeKind::GlobalSymbol, "_funcFoo", AllTargets,
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
index 076d0427a859..7a5d2be3ae95 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
@@ -3858,8 +3858,10 @@ void CodeGenDAGPatterns::parseInstructionPattern(CodeGenInstruction &CGI,
for (unsigned i = NumResults, e = CGI.Operands.size(); i != e; ++i) {
CGIOperandList::OperandInfo &Op = CGI.Operands[i];
const std::string &OpName = Op.Name;
- if (OpName.empty())
+ if (OpName.empty()) {
I.error("Operand #" + Twine(i) + " in operands list has no name!");
+ continue;
+ }
if (!InstInputs.count(OpName)) {
// If this is an operand with a DefaultOps set filled in, we can ignore
@@ -3872,16 +3874,19 @@ void CodeGenDAGPatterns::parseInstructionPattern(CodeGenInstruction &CGI,
}
I.error("Operand $" + OpName +
" does not appear in the instruction pattern");
+ continue;
}
TreePatternNodePtr InVal = InstInputs[OpName];
InstInputs.erase(OpName); // It occurred, remove from map.
if (InVal->isLeaf() && isa<DefInit>(InVal->getLeafValue())) {
Record *InRec = cast<DefInit>(InVal->getLeafValue())->getDef();
- if (!checkOperandClass(Op, InRec))
+ if (!checkOperandClass(Op, InRec)) {
I.error("Operand $" + OpName +
"'s register class disagrees"
" between the operand and pattern");
+ continue;
+ }
}
Operands.push_back(Op.Rec);
diff --git a/llvm/utils/TableGen/X86ManualCompressEVEXTables.def b/llvm/utils/TableGen/X86ManualCompressEVEXTables.def
index 77cf65be6842..665a394f57a6 100644
--- a/llvm/utils/TableGen/X86ManualCompressEVEXTables.def
+++ b/llvm/utils/TableGen/X86ManualCompressEVEXTables.def
@@ -197,12 +197,12 @@ ENTRY(VPUNPCKLQDQZ128rm, VPUNPCKLQDQrm)
ENTRY(VPUNPCKLQDQZ128rr, VPUNPCKLQDQrr)
ENTRY(VPXORQZ128rm, VPXORrm)
ENTRY(VPXORQZ128rr, VPXORrr)
-ENTRY(VRNDSCALEPDZ128rmi, VROUNDPDm)
-ENTRY(VRNDSCALEPDZ128rri, VROUNDPDr)
-ENTRY(VRNDSCALESDZm, VROUNDSDm)
-ENTRY(VRNDSCALESDZm_Int, VROUNDSDm_Int)
-ENTRY(VRNDSCALESDZr, VROUNDSDr)
-ENTRY(VRNDSCALESDZr_Int, VROUNDSDr_Int)
+ENTRY(VRNDSCALEPDZ128rmi, VROUNDPDmi)
+ENTRY(VRNDSCALEPDZ128rri, VROUNDPDri)
+ENTRY(VRNDSCALESDZm, VROUNDSDmi)
+ENTRY(VRNDSCALESDZm_Int, VROUNDSDmi_Int)
+ENTRY(VRNDSCALESDZr, VROUNDSDri)
+ENTRY(VRNDSCALESDZr_Int, VROUNDSDri_Int)
ENTRY(VSHUFPDZ128rmi, VSHUFPDrmi)
ENTRY(VSHUFPDZ128rri, VSHUFPDrri)
ENTRY(VSQRTPDZ128m, VSQRTPDm)
@@ -306,8 +306,8 @@ ENTRY(VPUNPCKLQDQZ256rm, VPUNPCKLQDQYrm)
ENTRY(VPUNPCKLQDQZ256rr, VPUNPCKLQDQYrr)
ENTRY(VPXORQZ256rm, VPXORYrm)
ENTRY(VPXORQZ256rr, VPXORYrr)
-ENTRY(VRNDSCALEPDZ256rmi, VROUNDPDYm)
-ENTRY(VRNDSCALEPDZ256rri, VROUNDPDYr)
+ENTRY(VRNDSCALEPDZ256rmi, VROUNDPDYmi)
+ENTRY(VRNDSCALEPDZ256rri, VROUNDPDYri)
ENTRY(VSHUFPDZ256rmi, VSHUFPDYrmi)
ENTRY(VSHUFPDZ256rri, VSHUFPDYrri)
ENTRY(VSQRTPDZ256m, VSQRTPDYm)
diff --git a/llvm/utils/gn/secondary/bolt/lib/Profile/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/Profile/BUILD.gn
index 5fead24afb45..dc85fb064121 100644
--- a/llvm/utils/gn/secondary/bolt/lib/Profile/BUILD.gn
+++ b/llvm/utils/gn/secondary/bolt/lib/Profile/BUILD.gn
@@ -12,7 +12,6 @@ static_library("Profile") {
"DataAggregator.cpp",
"DataReader.cpp",
"Heatmap.cpp",
- "ProfileReaderBase.cpp",
"StaleProfileMatching.cpp",
"YAMLProfileReader.cpp",
"YAMLProfileWriter.cpp",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn
index 33fdecfd948e..59dc38c8c4d8 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn
@@ -25,6 +25,7 @@ static_library("readability") {
"DeleteNullPointerCheck.cpp",
"DuplicateIncludeCheck.cpp",
"ElseAfterReturnCheck.cpp",
+ "EnumInitialValueCheck.cpp",
"FunctionCognitiveComplexityCheck.cpp",
"FunctionSizeCheck.cpp",
"IdentifierLengthCheck.cpp",
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 8a2ab18bf953..78a6e6fda826 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -352,6 +352,7 @@ if (current_toolchain == default_toolchain) {
"__chrono/formatter.h",
"__chrono/hh_mm_ss.h",
"__chrono/high_resolution_clock.h",
+ "__chrono/leap_second.h",
"__chrono/literals.h",
"__chrono/month.h",
"__chrono/month_weekday.h",
@@ -778,12 +779,12 @@ if (current_toolchain == default_toolchain) {
"__tree",
"__tuple/find_index.h",
"__tuple/make_tuple_types.h",
- "__tuple/pair_like.h",
"__tuple/sfinae_helpers.h",
"__tuple/tuple_element.h",
"__tuple/tuple_indices.h",
"__tuple/tuple_like.h",
"__tuple/tuple_like_ext.h",
+ "__tuple/tuple_like_no_subrange.h",
"__tuple/tuple_size.h",
"__tuple/tuple_types.h",
"__type_traits/add_const.h",
diff --git a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn
index 55309720725b..90f6f5d0f145 100644
--- a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn
@@ -315,6 +315,7 @@ if (libcxx_enable_experimental) {
sources = [ "experimental/keep.cpp" ]
if (libcxx_enable_filesystem && libcxx_enable_time_zone_database) {
sources += [
+ "include/tzdb/leap_second_private.h",
"include/tzdb/time_zone_link_private.h",
"include/tzdb/time_zone_private.h",
"include/tzdb/types_private.h",
diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
index 15766d4a7e34..fba81184d992 100644
--- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn
@@ -17,6 +17,7 @@ static_library("GlobalISel") {
"CallLowering.cpp",
"Combiner.cpp",
"CombinerHelper.cpp",
+ "CombinerHelperVectorOps.cpp",
"GIMatchTableExecutor.cpp",
"GISelChangeObserver.cpp",
"GISelKnownBits.cpp",
diff --git a/mlir/include/mlir/Dialect/Affine/LoopUtils.h b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
index 723a262f24ac..d143954b78fc 100644
--- a/mlir/include/mlir/Dialect/Affine/LoopUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
@@ -299,53 +299,8 @@ LogicalResult
separateFullTiles(MutableArrayRef<AffineForOp> nest,
SmallVectorImpl<AffineForOp> *fullTileNest = nullptr);
-/// Walk either an scf.for or an affine.for to find a band to coalesce.
-template <typename LoopOpTy>
-LogicalResult coalescePerfectlyNestedLoops(LoopOpTy op) {
- LogicalResult result(failure());
- SmallVector<LoopOpTy> loops;
- getPerfectlyNestedLoops(loops, op);
-
- // Look for a band of loops that can be coalesced, i.e. perfectly nested
- // loops with bounds defined above some loop.
- // 1. For each loop, find above which parent loop its operands are
- // defined.
- SmallVector<unsigned, 4> operandsDefinedAbove(loops.size());
- for (unsigned i = 0, e = loops.size(); i < e; ++i) {
- operandsDefinedAbove[i] = i;
- for (unsigned j = 0; j < i; ++j) {
- if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
- operandsDefinedAbove[i] = j;
- break;
- }
- }
- }
-
- // 2. Identify bands of loops such that the operands of all of them are
- // defined above the first loop in the band. Traverse the nest bottom-up
- // so that modifications don't invalidate the inner loops.
- for (unsigned end = loops.size(); end > 0; --end) {
- unsigned start = 0;
- for (; start < end - 1; ++start) {
- auto maxPos =
- *std::max_element(std::next(operandsDefinedAbove.begin(), start),
- std::next(operandsDefinedAbove.begin(), end));
- if (maxPos > start)
- continue;
- assert(maxPos == start &&
- "expected loop bounds to be known at the start of the band");
- auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
- if (succeeded(coalesceLoops(band)))
- result = success();
- break;
- }
- // If a band was found and transformed, keep looking at the loops above
- // the outermost transformed loop.
- if (start != end - 1)
- end = start + 1;
- }
- return result;
-}
+/// Walk an affine.for to find a band to coalesce.
+LogicalResult coalescePerfectlyNestedAffineLoops(AffineForOp op);
} // namespace affine
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
index 28526f1a1560..a52cca3c95de 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
@@ -562,8 +562,10 @@ class LLVM_DbgIntrOp<string name, string argName, list<Trait> traits = []>
}];
}
-def LLVM_DbgDeclareOp : LLVM_DbgIntrOp<"dbg.declare", "addr",
- [DeclareOpInterfaceMethods<PromotableOpInterface>]> {
+def LLVM_DbgDeclareOp : LLVM_DbgIntrOp<"dbg.declare", "addr", [
+ DeclareOpInterfaceMethods<PromotableOpInterface, [
+ "requiresReplacedValues", "visitReplacedValues"
+ ]>]> {
let summary = "Describes how the address relates to a source language variable.";
let arguments = (ins
LLVM_AnyPointer:$addr,
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index f33942b3c7c0..457451886e14 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -2135,8 +2135,8 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [Symbol,
let summary = "declares a reduction kind";
let description = [{
- Declares an OpenMP reduction kind. This requires two mandatory and one
- optional region.
+ Declares an OpenMP reduction kind. This requires two mandatory and two
+ optional regions.
1. The initializer region specifies how to initialize the thread-local
reduction value. This is usually the neutral element of the reduction.
@@ -2149,6 +2149,10 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [Symbol,
3. The atomic reduction region is optional and specifies how two values
can be combined atomically given local accumulator variables. It is
expected to store the combined value in the first accumulator variable.
+ 4. The cleanup region is optional and specifies how to clean up any memory
+ allocated by the initializer region. The region has an argument that
+ contains the value of the thread-local reduction accumulator. This will
+ be executed after the reduction has completed.
Note that the MLIR type system does not allow for type-polymorphic
reductions. Separate reduction declarations should be created for different
@@ -2163,12 +2167,14 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [Symbol,
let regions = (region AnyRegion:$initializerRegion,
AnyRegion:$reductionRegion,
- AnyRegion:$atomicReductionRegion);
+ AnyRegion:$atomicReductionRegion,
+ AnyRegion:$cleanupRegion);
let assemblyFormat = "$sym_name `:` $type attr-dict-with-keyword "
"`init` $initializerRegion "
"`combiner` $reductionRegion "
- "custom<AtomicReductionRegion>($atomicReductionRegion)";
+ "custom<AtomicReductionRegion>($atomicReductionRegion) "
+ "custom<CleanupReductionRegion>($cleanupRegion)";
let extraClassDeclaration = [{
PointerLikeType getAccumulatorType() {
diff --git a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
index 883d11bcc4df..bc09cc7f7fa5 100644
--- a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
@@ -100,11 +100,16 @@ getSCFMinMaxExpr(Value value, SmallVectorImpl<Value> &dims,
/// `loops` contains a list of perfectly nested loops with bounds and steps
/// independent of any loop induction variable involved in the nest.
LogicalResult coalesceLoops(MutableArrayRef<scf::ForOp> loops);
+LogicalResult coalesceLoops(RewriterBase &rewriter,
+ MutableArrayRef<scf::ForOp>);
+
+/// Walk an scf.for to find a band to coalesce.
+LogicalResult coalescePerfectlyNestedSCFForLoops(scf::ForOp op);
/// Take the ParallelLoop and for each set of dimension indices, combine them
/// into a single dimension. combinedDimensions must contain each index into
/// loops exactly once.
-void collapseParallelLoops(scf::ParallelOp loops,
+void collapseParallelLoops(RewriterBase &rewriter, scf::ParallelOp loops,
ArrayRef<std::vector<unsigned>> combinedDimensions);
/// Unrolls this for operation by the specified unroll factor. Returns failure
diff --git a/mlir/include/mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h b/mlir/include/mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h
index 31e19ff1ad39..67a6581eb2fb 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h
+++ b/mlir/include/mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h
@@ -29,9 +29,12 @@ struct ValueBoundsConstraintSet : protected ::mlir::ValueBoundsConstraintSet {
struct ScalableValueBoundsConstraintSet
: public llvm::RTTIExtends<ScalableValueBoundsConstraintSet,
detail::ValueBoundsConstraintSet> {
- ScalableValueBoundsConstraintSet(MLIRContext *context, unsigned vscaleMin,
- unsigned vscaleMax)
- : RTTIExtends(context), vscaleMin(vscaleMin), vscaleMax(vscaleMax){};
+ ScalableValueBoundsConstraintSet(
+ MLIRContext *context,
+ ValueBoundsConstraintSet::StopConditionFn stopCondition,
+ unsigned vscaleMin, unsigned vscaleMax)
+ : RTTIExtends(context, stopCondition), vscaleMin(vscaleMin),
+ vscaleMax(vscaleMax) {};
using RTTIExtends::bound;
using RTTIExtends::StopConditionFn;
diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h
index 15b1c3892948..2562301e499d 100644
--- a/mlir/include/mlir/IR/PatternMatch.h
+++ b/mlir/include/mlir/IR/PatternMatch.h
@@ -15,6 +15,7 @@
#include "llvm/Support/TypeName.h"
#include <optional>
+using llvm::SmallPtrSetImpl;
namespace mlir {
class PatternRewriter;
@@ -704,6 +705,8 @@ public:
return user != exceptedUser;
});
}
+ void replaceAllUsesExcept(Value from, Value to,
+ const SmallPtrSetImpl<Operation *> &preservedUsers);
/// Used to notify the listener that the IR failed to be rewritten because of
/// a match failure, and provide a callback to populate a diagnostic with the
diff --git a/mlir/include/mlir/Interfaces/MemorySlotInterfaces.td b/mlir/include/mlir/Interfaces/MemorySlotInterfaces.td
index e10e2d4e104c..9db89361c780 100644
--- a/mlir/include/mlir/Interfaces/MemorySlotInterfaces.td
+++ b/mlir/include/mlir/Interfaces/MemorySlotInterfaces.td
@@ -229,6 +229,36 @@ def PromotableOpInterface : OpInterface<"PromotableOpInterface"> {
(ins "const ::llvm::SmallPtrSetImpl<mlir::OpOperand *> &":$blockingUses,
"::mlir::RewriterBase &":$rewriter)
>,
+ InterfaceMethod<[{
+ This method allows the promoted operation to visit the SSA values used
+ in place of the memory slot once the promotion process of the memory
+ slot is complete.
+
+ If this method returns true, the `visitReplacedValues` method on this
+ operation will be called after the main mutation stage finishes
+ (i.e., after all ops have been processed with `removeBlockingUses`).
+
+      Operations should only use the replaced values if the intended
+ transformation applies to all the replaced values. Furthermore, replaced
+ values must not be deleted.
+ }], "bool", "requiresReplacedValues", (ins), [{}],
+ [{ return false; }]
+ >,
+ InterfaceMethod<[{
+ Transforms the IR using the SSA values that replaced the memory slot.
+
+ This method will only be called after all blocking uses have been
+ scheduled for removal and if `requiresReplacedValues` returned
+ true.
+
+ The rewriter is located after the promotable operation on call. All IR
+ mutations must happen through the rewriter. During the transformation,
+ *no operation should be deleted*.
+ }],
+ "void", "visitReplacedValues",
+ (ins "::llvm::ArrayRef<std::pair<::mlir::Operation*, ::mlir::Value>>":$mutatedDefs,
+ "::mlir::RewriterBase &":$rewriter), [{}], [{ return; }]
+ >,
];
}
diff --git a/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h b/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h
index bdfd689c7ac4..83107a3f5f94 100644
--- a/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h
+++ b/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h
@@ -117,8 +117,9 @@ public:
///
/// The first parameter of the function is the shaped value/index-typed
/// value. The second parameter is the dimension in case of a shaped value.
- using StopConditionFn =
- function_ref<bool(Value, std::optional<int64_t> /*dim*/)>;
+ /// The third parameter is this constraint set.
+ using StopConditionFn = std::function<bool(
+ Value, std::optional<int64_t> /*dim*/, ValueBoundsConstraintSet &cstr)>;
/// Compute a bound for the given index-typed value or shape dimension size.
/// The computed bound is stored in `resultMap`. The operands of the bound are
@@ -271,22 +272,20 @@ protected:
/// An index-typed value or the dimension of a shaped-type value.
using ValueDim = std::pair<Value, int64_t>;
- ValueBoundsConstraintSet(MLIRContext *ctx);
+ ValueBoundsConstraintSet(MLIRContext *ctx, StopConditionFn stopCondition);
/// Populates the constraint set for a value/map without actually computing
/// the bound. Returns the position for the value/map (via the return value
/// and `posOut` output parameter).
int64_t populateConstraintsSet(Value value,
- std::optional<int64_t> dim = std::nullopt,
- StopConditionFn stopCondition = nullptr);
+ std::optional<int64_t> dim = std::nullopt);
int64_t populateConstraintsSet(AffineMap map, ValueDimList mapOperands,
- StopConditionFn stopCondition = nullptr,
int64_t *posOut = nullptr);
/// Iteratively process all elements on the worklist until an index-typed
/// value or shaped value meets `stopCondition`. Such values are not processed
/// any further.
- void processWorklist(StopConditionFn stopCondition);
+ void processWorklist();
/// Bound the given column in the underlying constraint set by the given
/// expression.
@@ -333,6 +332,9 @@ protected:
/// Builder for constructing affine expressions.
Builder builder;
+
+ /// The current stop condition function.
+ StopConditionFn stopCondition = nullptr;
};
} // namespace mlir
diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp
index 1dc69ab493d4..05c77070a70c 100644
--- a/mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopCoalescing.cpp
@@ -39,9 +39,9 @@ struct LoopCoalescingPass
func::FuncOp func = getOperation();
func.walk<WalkOrder::PreOrder>([](Operation *op) {
if (auto scfForOp = dyn_cast<scf::ForOp>(op))
- (void)coalescePerfectlyNestedLoops(scfForOp);
+ (void)coalescePerfectlyNestedSCFForLoops(scfForOp);
else if (auto affineForOp = dyn_cast<AffineForOp>(op))
- (void)coalescePerfectlyNestedLoops(affineForOp);
+ (void)coalescePerfectlyNestedAffineLoops(affineForOp);
});
}
};
diff --git a/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp b/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp
index 37b36f76d446..117ee8e8701a 100644
--- a/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp
@@ -84,7 +84,8 @@ FailureOr<OpFoldResult> mlir::affine::reifyShapedValueDimBound(
OpBuilder &b, Location loc, presburger::BoundType type, Value value,
int64_t dim, ValueBoundsConstraintSet::StopConditionFn stopCondition,
bool closedUB) {
- auto reifyToOperands = [&](Value v, std::optional<int64_t> d) {
+ auto reifyToOperands = [&](Value v, std::optional<int64_t> d,
+ ValueBoundsConstraintSet &cstr) {
// We are trying to reify a bound for `value` in terms of the owning op's
// operands. Construct a stop condition that evaluates to "true" for any SSA
// value except for `value`. I.e., the bound will be computed in terms of
@@ -100,7 +101,8 @@ FailureOr<OpFoldResult> mlir::affine::reifyShapedValueDimBound(
FailureOr<OpFoldResult> mlir::affine::reifyIndexValueBound(
OpBuilder &b, Location loc, presburger::BoundType type, Value value,
ValueBoundsConstraintSet::StopConditionFn stopCondition, bool closedUB) {
- auto reifyToOperands = [&](Value v, std::optional<int64_t> d) {
+ auto reifyToOperands = [&](Value v, std::optional<int64_t> d,
+ ValueBoundsConstraintSet &cstr) {
return v != value;
};
return reifyValueBound(b, loc, type, value, /*dim=*/std::nullopt,
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index af59973d7a92..268050a30e00 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -2765,3 +2765,51 @@ mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
return success();
}
+
+LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) {
+ LogicalResult result(failure());
+ SmallVector<AffineForOp> loops;
+ getPerfectlyNestedLoops(loops, op);
+ if (loops.size() <= 1)
+ return success();
+
+ // Look for a band of loops that can be coalesced, i.e. perfectly nested
+ // loops with bounds defined above some loop.
+ // 1. For each loop, find above which parent loop its operands are
+ // defined.
+ SmallVector<unsigned> operandsDefinedAbove(loops.size());
+ for (unsigned i = 0, e = loops.size(); i < e; ++i) {
+ operandsDefinedAbove[i] = i;
+ for (unsigned j = 0; j < i; ++j) {
+ if (areValuesDefinedAbove(loops[i].getOperands(), loops[j].getRegion())) {
+ operandsDefinedAbove[i] = j;
+ break;
+ }
+ }
+ }
+
+ // 2. Identify bands of loops such that the operands of all of them are
+ // defined above the first loop in the band. Traverse the nest bottom-up
+ // so that modifications don't invalidate the inner loops.
+ for (unsigned end = loops.size(); end > 0; --end) {
+ unsigned start = 0;
+ for (; start < end - 1; ++start) {
+ auto maxPos =
+ *std::max_element(std::next(operandsDefinedAbove.begin(), start),
+ std::next(operandsDefinedAbove.begin(), end));
+ if (maxPos > start)
+ continue;
+ assert(maxPos == start &&
+ "expected loop bounds to be known at the start of the band");
+ auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
+ if (succeeded(coalesceLoops(band)))
+ result = success();
+ break;
+ }
+ // If a band was found and transformed, keep looking at the loops above
+ // the outermost transformed loop.
+ if (start != end - 1)
+ end = start + 1;
+ }
+ return result;
+}
diff --git a/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp b/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp
index 8d9fd1478aa9..fad221288f19 100644
--- a/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp
@@ -119,7 +119,8 @@ FailureOr<OpFoldResult> mlir::arith::reifyShapedValueDimBound(
OpBuilder &b, Location loc, presburger::BoundType type, Value value,
int64_t dim, ValueBoundsConstraintSet::StopConditionFn stopCondition,
bool closedUB) {
- auto reifyToOperands = [&](Value v, std::optional<int64_t> d) {
+ auto reifyToOperands = [&](Value v, std::optional<int64_t> d,
+ ValueBoundsConstraintSet &cstr) {
// We are trying to reify a bound for `value` in terms of the owning op's
// operands. Construct a stop condition that evaluates to "true" for any SSA
// value except for `value`. I.e., the bound will be computed in terms of
@@ -135,7 +136,8 @@ FailureOr<OpFoldResult> mlir::arith::reifyShapedValueDimBound(
FailureOr<OpFoldResult> mlir::arith::reifyIndexValueBound(
OpBuilder &b, Location loc, presburger::BoundType type, Value value,
ValueBoundsConstraintSet::StopConditionFn stopCondition, bool closedUB) {
- auto reifyToOperands = [&](Value v, std::optional<int64_t> d) {
+ auto reifyToOperands = [&](Value v, std::optional<int64_t> d,
+ ValueBoundsConstraintSet &cstr) {
return v != value;
};
return reifyValueBound(b, loc, type, value, /*dim=*/std::nullopt,
diff --git a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
index 5d11f8f6cc45..1320db3f9e54 100644
--- a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
+++ b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
@@ -531,8 +531,8 @@ static ParseResult parseSwitchOpCases(
failed(parser.parseSuccessor(destination)))
return failure();
if (succeeded(parser.parseOptionalLParen())) {
- if (failed(parser.parseOperandList(operands, OpAsmParser::Delimiter::None,
- /*allowResultNumber=*/false)) ||
+ if (failed(parser.parseOperandList(operands,
+ OpAsmParser::Delimiter::None)) ||
failed(parser.parseColonTypeList(operandTypes)) ||
failed(parser.parseRParen()))
return failure();
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
index 06c1fdd2eb2d..563755297c57 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
@@ -168,13 +168,6 @@ bool LLVM::StoreOp::canUsesBeRemoved(
DeletionKind LLVM::StoreOp::removeBlockingUses(
const MemorySlot &slot, const SmallPtrSetImpl<OpOperand *> &blockingUses,
RewriterBase &rewriter, Value reachingDefinition) {
- // `canUsesBeRemoved` checked this blocking use must be the stored slot
- // pointer.
- for (Operation *user : slot.ptr.getUsers())
- if (auto declareOp = dyn_cast<LLVM::DbgDeclareOp>(user))
- rewriter.create<LLVM::DbgValueOp>(declareOp->getLoc(), getValue(),
- declareOp.getVarInfo(),
- declareOp.getLocationExpr());
return DeletionKind::Delete;
}
@@ -407,6 +400,18 @@ DeletionKind LLVM::DbgValueOp::removeBlockingUses(
return DeletionKind::Keep;
}
+bool LLVM::DbgDeclareOp::requiresReplacedValues() { return true; }
+
+void LLVM::DbgDeclareOp::visitReplacedValues(
+ ArrayRef<std::pair<Operation *, Value>> definitions,
+ RewriterBase &rewriter) {
+ for (auto [op, value] : definitions) {
+ rewriter.setInsertionPointAfter(op);
+ rewriter.create<LLVM::DbgValueOp>(getLoc(), value, getVarInfo(),
+ getLocationExpr());
+ }
+}
+
//===----------------------------------------------------------------------===//
// Interfaces for GEPOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 2d7219fef87c..9c5c58fa1fab 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -373,14 +373,15 @@ namespace {
class RegionBuilderHelper {
public:
- RegionBuilderHelper(MLIRContext *context, Block &block)
- : context(context), block(block) {}
+ RegionBuilderHelper(OpBuilder &builder, Block &block)
+ : builder(builder), block(block) {}
// Build the unary functions defined by OpDSL.
Value buildUnaryFn(UnaryFn unaryFn, Value arg) {
if (!isFloatingPoint(arg))
llvm_unreachable("unsupported non numeric type");
- OpBuilder builder = getBuilder();
+ OpBuilder::InsertionGuard g(builder);
+ builder.setInsertionPointToEnd(&block);
switch (unaryFn) {
case UnaryFn::exp:
return builder.create<math::ExpOp>(arg.getLoc(), arg);
@@ -407,7 +408,8 @@ public:
arg1.getType().getIntOrFloatBitWidth() == 1;
if (!allComplex && !allFloatingPoint && !allInteger)
llvm_unreachable("unsupported non numeric type");
- OpBuilder builder = getBuilder();
+ OpBuilder::InsertionGuard g(builder);
+ builder.setInsertionPointToEnd(&block);
switch (binaryFn) {
case BinaryFn::add:
if (allComplex)
@@ -481,29 +483,32 @@ public:
}
void yieldOutputs(ValueRange values) {
- OpBuilder builder = getBuilder();
+ OpBuilder::InsertionGuard g(builder);
+ builder.setInsertionPointToEnd(&block);
Location loc = builder.getUnknownLoc();
builder.create<YieldOp>(loc, values);
}
Value constant(const std::string &value) {
- OpBuilder builder = getBuilder();
+ OpBuilder::InsertionGuard g(builder);
+ builder.setInsertionPointToEnd(&block);
Location loc = builder.getUnknownLoc();
Attribute valueAttr = parseAttribute(value, builder.getContext());
return builder.create<arith::ConstantOp>(loc, ::cast<TypedAttr>(valueAttr));
}
Value index(int64_t dim) {
- OpBuilder builder = getBuilder();
+ OpBuilder::InsertionGuard g(builder);
+ builder.setInsertionPointToEnd(&block);
return builder.create<IndexOp>(builder.getUnknownLoc(), dim);
}
Type getIntegerType(unsigned width) {
- return IntegerType::get(context, width);
+ return IntegerType::get(builder.getContext(), width);
}
- Type getFloat32Type() { return Float32Type::get(context); }
- Type getFloat64Type() { return Float64Type::get(context); }
+ Type getFloat32Type() { return Float32Type::get(builder.getContext()); }
+ Type getFloat64Type() { return Float64Type::get(builder.getContext()); }
private:
// Generates operations to cast the given operand to a specified type.
@@ -511,7 +516,8 @@ private:
// operand returned as-is (which will presumably yield a verification
// issue downstream).
Value cast(Type toType, Value operand, bool isUnsignedCast) {
- OpBuilder builder = getBuilder();
+ OpBuilder::InsertionGuard g(builder);
+ builder.setInsertionPointToEnd(&block);
auto loc = operand.getLoc();
return convertScalarToDtype(builder, loc, operand, toType, isUnsignedCast);
}
@@ -526,13 +532,7 @@ private:
return llvm::isa<IntegerType>(value.getType());
}
- OpBuilder getBuilder() {
- OpBuilder builder(context);
- builder.setInsertionPointToEnd(&block);
- return builder;
- }
-
- MLIRContext *context;
+ OpBuilder &builder;
Block &block;
};
diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
index b32ea8eebaec..c3a08ce86082 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
@@ -468,7 +468,7 @@ HoistPaddingAnalysis::getHoistedPackedTensorSizes(RewriterBase &rewriter,
FailureOr<OpFoldResult> loopUb = affine::reifyIndexValueBound(
rewriter, loc, presburger::BoundType::UB, forOp.getUpperBound(),
/*stopCondition=*/
- [&](Value v, std::optional<int64_t> d) {
+ [&](Value v, std::optional<int64_t> d, ValueBoundsConstraintSet &cstr) {
if (v == forOp.getUpperBound())
return false;
// Compute a bound that is independent of any affine op results.
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index bf5875071e0d..a04343154a4d 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -1538,6 +1538,21 @@ static void printAtomicReductionRegion(OpAsmPrinter &printer,
printer.printRegion(region);
}
+static ParseResult parseCleanupReductionRegion(OpAsmParser &parser,
+ Region &region) {
+ if (parser.parseOptionalKeyword("cleanup"))
+ return success();
+ return parser.parseRegion(region);
+}
+
+static void printCleanupReductionRegion(OpAsmPrinter &printer,
+ DeclareReductionOp op, Region &region) {
+ if (region.empty())
+ return;
+ printer << "cleanup ";
+ printer.printRegion(region);
+}
+
LogicalResult DeclareReductionOp::verifyRegions() {
if (getInitializerRegion().empty())
return emitOpError() << "expects non-empty initializer region";
@@ -1571,21 +1586,29 @@ LogicalResult DeclareReductionOp::verifyRegions() {
"of the reduction type";
}
- if (getAtomicReductionRegion().empty())
+ if (!getAtomicReductionRegion().empty()) {
+ Block &atomicReductionEntryBlock = getAtomicReductionRegion().front();
+ if (atomicReductionEntryBlock.getNumArguments() != 2 ||
+ atomicReductionEntryBlock.getArgumentTypes()[0] !=
+ atomicReductionEntryBlock.getArgumentTypes()[1])
+ return emitOpError() << "expects atomic reduction region with two "
+ "arguments of the same type";
+ auto ptrType = llvm::dyn_cast<PointerLikeType>(
+ atomicReductionEntryBlock.getArgumentTypes()[0]);
+ if (!ptrType ||
+ (ptrType.getElementType() && ptrType.getElementType() != getType()))
+ return emitOpError() << "expects atomic reduction region arguments to "
+ "be accumulators containing the reduction type";
+ }
+
+ if (getCleanupRegion().empty())
return success();
+ Block &cleanupEntryBlock = getCleanupRegion().front();
+ if (cleanupEntryBlock.getNumArguments() != 1 ||
+ cleanupEntryBlock.getArgument(0).getType() != getType())
+ return emitOpError() << "expects cleanup region with one argument "
+ "of the reduction type";
- Block &atomicReductionEntryBlock = getAtomicReductionRegion().front();
- if (atomicReductionEntryBlock.getNumArguments() != 2 ||
- atomicReductionEntryBlock.getArgumentTypes()[0] !=
- atomicReductionEntryBlock.getArgumentTypes()[1])
- return emitOpError() << "expects atomic reduction region with two "
- "arguments of the same type";
- auto ptrType = llvm::dyn_cast<PointerLikeType>(
- atomicReductionEntryBlock.getArgumentTypes()[0]);
- if (!ptrType ||
- (ptrType.getElementType() && ptrType.getElementType() != getType()))
- return emitOpError() << "expects atomic reduction region arguments to "
- "be accumulators containing the reduction type";
return success();
}
diff --git a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp
index cb36e0cecf0d..1e13e60068ee 100644
--- a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp
@@ -58,7 +58,7 @@ struct ForOpInterface
ValueDimList boundOperands;
LogicalResult status = ValueBoundsConstraintSet::computeBound(
bound, boundOperands, BoundType::EQ, yieldedValue, dim,
- [&](Value v, std::optional<int64_t> d) {
+ [&](Value v, std::optional<int64_t> d, ValueBoundsConstraintSet &cstr) {
// Stop when reaching a block argument of the loop body.
if (auto bbArg = llvm::dyn_cast<BlockArgument>(v))
return bbArg.getOwner()->getParentOp() == forOp;
diff --git a/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp b/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp
index c09184148208..7e4faf8b73af 100644
--- a/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp
+++ b/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp
@@ -332,9 +332,9 @@ transform::LoopCoalesceOp::applyToOne(transform::TransformRewriter &rewriter,
transform::TransformState &state) {
LogicalResult result(failure());
if (scf::ForOp scfForOp = dyn_cast<scf::ForOp>(op))
- result = coalescePerfectlyNestedLoops(scfForOp);
+ result = coalescePerfectlyNestedSCFForLoops(scfForOp);
else if (AffineForOp affineForOp = dyn_cast<AffineForOp>(op))
- result = coalescePerfectlyNestedLoops(affineForOp);
+ result = coalescePerfectlyNestedAffineLoops(affineForOp);
results.push_back(op);
if (failed(result)) {
diff --git a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp
index a69df025bcba..6ba7020e86fa 100644
--- a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopCollapsing.cpp
@@ -28,6 +28,7 @@ namespace {
struct TestSCFParallelLoopCollapsing
: public impl::TestSCFParallelLoopCollapsingBase<
TestSCFParallelLoopCollapsing> {
+
void runOnOperation() override {
Operation *module = getOperation();
@@ -88,6 +89,7 @@ struct TestSCFParallelLoopCollapsing
// Only apply the transformation on parallel loops where the specified
// transformation is valid, but do NOT early abort in the case of invalid
// loops.
+ IRRewriter rewriter(&getContext());
module->walk([&](scf::ParallelOp op) {
if (flattenedCombinedLoops.size() != op.getNumLoops()) {
op.emitOpError("has ")
@@ -97,7 +99,7 @@ struct TestSCFParallelLoopCollapsing
<< flattenedCombinedLoops.size() << " iter args.";
return;
}
- collapseParallelLoops(op, combinedLoops);
+ collapseParallelLoops(rewriter, op, combinedLoops);
});
}
};
diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
index 914aeb4fa79f..9279081cfd45 100644
--- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
@@ -13,6 +13,7 @@
#include "mlir/Dialect/SCF/Utils/Utils.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/BuiltinOps.h"
@@ -472,18 +473,23 @@ LogicalResult mlir::loopUnrollByFactor(
return success();
}
-/// Return the new lower bound, upper bound, and step in that order. Insert any
-/// additional bounds calculations before the given builder and any additional
-/// conversion back to the original loop induction value inside the given Block.
-static LoopParams normalizeLoop(OpBuilder &boundsBuilder,
- OpBuilder &insideLoopBuilder, Location loc,
- Value lowerBound, Value upperBound, Value step,
- Value inductionVar) {
+/// Transform a loop with a strictly positive step
+/// for %i = %lb to %ub step %s
+/// into a 0-based loop with step 1
+/// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
+/// %i = %ii * %s + %lb
+/// Insert the induction variable remapping in the body of `inner`, which is
+/// expected to be either `loop` or another loop perfectly nested under `loop`.
+/// Insert the definition of new bounds immediately before `outer`, which is
+/// expected to be either `loop` or its parent in the loop nest.
+static LoopParams emitNormalizedLoopBounds(RewriterBase &rewriter, Location loc,
+ Value lb, Value ub, Value step) {
+ // For non-index types, generate `arith` instructions
// Check if the loop is already known to have a constant zero lower bound or
// a constant one step.
bool isZeroBased = false;
- if (auto ubCst = getConstantIntValue(lowerBound))
- isZeroBased = ubCst.value() == 0;
+ if (auto lbCst = getConstantIntValue(lb))
+ isZeroBased = lbCst.value() == 0;
bool isStepOne = false;
if (auto stepCst = getConstantIntValue(step))
@@ -493,62 +499,90 @@ static LoopParams normalizeLoop(OpBuilder &boundsBuilder,
// assuming the step is strictly positive. Update the bounds and the step
// of the loop to go from 0 to the number of iterations, if necessary.
if (isZeroBased && isStepOne)
- return {/*lowerBound=*/lowerBound, /*upperBound=*/upperBound,
- /*step=*/step};
+ return {lb, ub, step};
- Value diff = boundsBuilder.create<arith::SubIOp>(loc, upperBound, lowerBound);
+ Value diff = isZeroBased ? ub : rewriter.create<arith::SubIOp>(loc, ub, lb);
Value newUpperBound =
- boundsBuilder.create<arith::CeilDivSIOp>(loc, diff, step);
-
- Value newLowerBound =
- isZeroBased ? lowerBound
- : boundsBuilder.create<arith::ConstantOp>(
- loc, boundsBuilder.getZeroAttr(lowerBound.getType()));
- Value newStep =
- isStepOne ? step
- : boundsBuilder.create<arith::ConstantOp>(
- loc, boundsBuilder.getIntegerAttr(step.getType(), 1));
-
- // Insert code computing the value of the original loop induction variable
- // from the "normalized" one.
- Value scaled =
- isStepOne
- ? inductionVar
- : insideLoopBuilder.create<arith::MulIOp>(loc, inductionVar, step);
- Value shifted =
- isZeroBased
- ? scaled
- : insideLoopBuilder.create<arith::AddIOp>(loc, scaled, lowerBound);
-
- SmallPtrSet<Operation *, 2> preserve{scaled.getDefiningOp(),
- shifted.getDefiningOp()};
- inductionVar.replaceAllUsesExcept(shifted, preserve);
- return {/*lowerBound=*/newLowerBound, /*upperBound=*/newUpperBound,
- /*step=*/newStep};
+ isStepOne ? diff : rewriter.create<arith::CeilDivSIOp>(loc, diff, step);
+
+ Value newLowerBound = isZeroBased
+ ? lb
+ : rewriter.create<arith::ConstantOp>(
+ loc, rewriter.getZeroAttr(lb.getType()));
+ Value newStep = isStepOne
+ ? step
+ : rewriter.create<arith::ConstantOp>(
+ loc, rewriter.getIntegerAttr(step.getType(), 1));
+
+ return {newLowerBound, newUpperBound, newStep};
}
-/// Transform a loop with a strictly positive step
-/// for %i = %lb to %ub step %s
-/// into a 0-based loop with step 1
-/// for %ii = 0 to ceildiv(%ub - %lb, %s) step 1 {
-/// %i = %ii * %s + %lb
-/// Insert the induction variable remapping in the body of `inner`, which is
-/// expected to be either `loop` or another loop perfectly nested under `loop`.
-/// Insert the definition of new bounds immediate before `outer`, which is
-/// expected to be either `loop` or its parent in the loop nest.
-static void normalizeLoop(scf::ForOp loop, scf::ForOp outer, scf::ForOp inner) {
- OpBuilder builder(outer);
- OpBuilder innerBuilder = OpBuilder::atBlockBegin(inner.getBody());
- auto loopPieces = normalizeLoop(builder, innerBuilder, loop.getLoc(),
- loop.getLowerBound(), loop.getUpperBound(),
- loop.getStep(), loop.getInductionVar());
-
- loop.setLowerBound(loopPieces.lowerBound);
- loop.setUpperBound(loopPieces.upperBound);
- loop.setStep(loopPieces.step);
+/// Get back the original induction variable values after loop normalization
+static void denormalizeInductionVariable(RewriterBase &rewriter, Location loc,
+ Value normalizedIv, Value origLb,
+ Value origStep) {
+ Value denormalizedIv;
+ SmallPtrSet<Operation *, 2> preserve;
+ bool isStepOne = isConstantIntValue(origStep, 1);
+ bool isZeroBased = isConstantIntValue(origLb, 0);
+
+ Value scaled = normalizedIv;
+ if (!isStepOne) {
+ scaled = rewriter.create<arith::MulIOp>(loc, normalizedIv, origStep);
+ preserve.insert(scaled.getDefiningOp());
+ }
+ denormalizedIv = scaled;
+ if (!isZeroBased) {
+ denormalizedIv = rewriter.create<arith::AddIOp>(loc, scaled, origLb);
+ preserve.insert(denormalizedIv.getDefiningOp());
+ }
+
+ rewriter.replaceAllUsesExcept(normalizedIv, denormalizedIv, preserve);
}
-LogicalResult mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
+/// Helper function to multiply a sequence of values.
+static Value getProductOfIntsOrIndexes(RewriterBase &rewriter, Location loc,
+ ArrayRef<Value> values) {
+ assert(!values.empty() && "unexpected empty list");
+ Value productOf = values.front();
+ for (auto v : values.drop_front()) {
+ productOf = rewriter.create<arith::MulIOp>(loc, productOf, v);
+ }
+ return productOf;
+}
+
+/// For each original loop, the value of the
+/// induction variable can be obtained by dividing the induction variable of
+/// the linearized loop by the total number of iterations of the loops nested
+/// in it modulo the number of iterations in this loop (remove the values
+/// related to the outer loops):
+/// iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
+/// Compute these iteratively from the innermost loop by creating a "running
+/// quotient" of division by the range.
+static std::pair<SmallVector<Value>, SmallPtrSet<Operation *, 2>>
+delinearizeInductionVariable(RewriterBase &rewriter, Location loc,
+ Value linearizedIv, ArrayRef<Value> ubs) {
+ Value previous = linearizedIv;
+ SmallVector<Value> delinearizedIvs(ubs.size());
+ SmallPtrSet<Operation *, 2> preservedUsers;
+ for (unsigned i = 0, e = ubs.size(); i < e; ++i) {
+ unsigned idx = ubs.size() - i - 1;
+ if (i != 0) {
+ previous = rewriter.create<arith::DivSIOp>(loc, previous, ubs[idx + 1]);
+ preservedUsers.insert(previous.getDefiningOp());
+ }
+ Value iv = previous;
+ if (i != e - 1) {
+ iv = rewriter.create<arith::RemSIOp>(loc, previous, ubs[idx]);
+ preservedUsers.insert(iv.getDefiningOp());
+ }
+ delinearizedIvs[idx] = iv;
+ }
+ return {delinearizedIvs, preservedUsers};
+}
+
+LogicalResult mlir::coalesceLoops(RewriterBase &rewriter,
+ MutableArrayRef<scf::ForOp> loops) {
if (loops.size() < 2)
return failure();
@@ -557,57 +591,148 @@ LogicalResult mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
// 1. Make sure all loops iterate from 0 to upperBound with step 1. This
// allows the following code to assume upperBound is the number of iterations.
- for (auto loop : loops)
- normalizeLoop(loop, outermost, innermost);
+ for (auto loop : loops) {
+ OpBuilder::InsertionGuard g(rewriter);
+ rewriter.setInsertionPoint(outermost);
+ Value lb = loop.getLowerBound();
+ Value ub = loop.getUpperBound();
+ Value step = loop.getStep();
+ auto newLoopParams =
+ emitNormalizedLoopBounds(rewriter, loop.getLoc(), lb, ub, step);
+
+ rewriter.modifyOpInPlace(loop, [&]() {
+ loop.setLowerBound(newLoopParams.lowerBound);
+ loop.setUpperBound(newLoopParams.upperBound);
+ loop.setStep(newLoopParams.step);
+ });
+
+ rewriter.setInsertionPointToStart(innermost.getBody());
+ denormalizeInductionVariable(rewriter, loop.getLoc(),
+ loop.getInductionVar(), lb, step);
+ }
// 2. Emit code computing the upper bound of the coalesced loop as product
// of the number of iterations of all loops.
- OpBuilder builder(outermost);
+ OpBuilder::InsertionGuard g(rewriter);
+ rewriter.setInsertionPoint(outermost);
Location loc = outermost.getLoc();
- Value upperBound = outermost.getUpperBound();
- for (auto loop : loops.drop_front())
- upperBound =
- builder.create<arith::MulIOp>(loc, upperBound, loop.getUpperBound());
+ SmallVector<Value> upperBounds = llvm::map_to_vector(
+ loops, [](auto loop) { return loop.getUpperBound(); });
+ Value upperBound = getProductOfIntsOrIndexes(rewriter, loc, upperBounds);
outermost.setUpperBound(upperBound);
- builder.setInsertionPointToStart(outermost.getBody());
-
- // 3. Remap induction variables. For each original loop, the value of the
- // induction variable can be obtained by dividing the induction variable of
- // the linearized loop by the total number of iterations of the loops nested
- // in it modulo the number of iterations in this loop (remove the values
- // related to the outer loops):
- // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
- // Compute these iteratively from the innermost loop by creating a "running
- // quotient" of division by the range.
- Value previous = outermost.getInductionVar();
+ rewriter.setInsertionPointToStart(innermost.getBody());
+ auto [delinearizeIvs, preservedUsers] = delinearizeInductionVariable(
+ rewriter, loc, outermost.getInductionVar(), upperBounds);
+ rewriter.replaceAllUsesExcept(outermost.getInductionVar(), delinearizeIvs[0],
+ preservedUsers);
+
+ for (int i = loops.size() - 1; i > 0; --i) {
+ auto outerLoop = loops[i - 1];
+ auto innerLoop = loops[i];
+
+ Operation *innerTerminator = innerLoop.getBody()->getTerminator();
+ auto yieldedVals = llvm::to_vector(innerTerminator->getOperands());
+ rewriter.eraseOp(innerTerminator);
+
+ SmallVector<Value> innerBlockArgs;
+ innerBlockArgs.push_back(delinearizeIvs[i]);
+ llvm::append_range(innerBlockArgs, outerLoop.getRegionIterArgs());
+ rewriter.inlineBlockBefore(innerLoop.getBody(), outerLoop.getBody(),
+ Block::iterator(innerLoop), innerBlockArgs);
+ rewriter.replaceOp(innerLoop, yieldedVals);
+ }
+ return success();
+}
+
+LogicalResult mlir::coalesceLoops(MutableArrayRef<scf::ForOp> loops) {
+ if (loops.empty()) {
+ return failure();
+ }
+ IRRewriter rewriter(loops.front().getContext());
+ return coalesceLoops(rewriter, loops);
+}
+
+LogicalResult mlir::coalescePerfectlyNestedSCFForLoops(scf::ForOp op) {
+ LogicalResult result(failure());
+ SmallVector<scf::ForOp> loops;
+ getPerfectlyNestedLoops(loops, op);
+
+ // Look for a band of loops that can be coalesced, i.e. perfectly nested
+ // loops with bounds defined above some loop.
+
+ // 1. For each loop, find above which parent loop its bounds operands are
+ // defined.
+ SmallVector<unsigned> operandsDefinedAbove(loops.size());
for (unsigned i = 0, e = loops.size(); i < e; ++i) {
- unsigned idx = loops.size() - i - 1;
- if (i != 0)
- previous = builder.create<arith::DivSIOp>(loc, previous,
- loops[idx + 1].getUpperBound());
-
- Value iv = (i == e - 1) ? previous
- : builder.create<arith::RemSIOp>(
- loc, previous, loops[idx].getUpperBound());
- replaceAllUsesInRegionWith(loops[idx].getInductionVar(), iv,
- loops.back().getRegion());
+ operandsDefinedAbove[i] = i;
+ for (unsigned j = 0; j < i; ++j) {
+ SmallVector<Value> boundsOperands = {loops[i].getLowerBound(),
+ loops[i].getUpperBound(),
+ loops[i].getStep()};
+ if (areValuesDefinedAbove(boundsOperands, loops[j].getRegion())) {
+ operandsDefinedAbove[i] = j;
+ break;
+ }
+ }
}
- // 4. Move the operations from the innermost just above the second-outermost
- // loop, delete the extra terminator and the second-outermost loop.
- scf::ForOp second = loops[1];
- innermost.getBody()->back().erase();
- outermost.getBody()->getOperations().splice(
- Block::iterator(second.getOperation()),
- innermost.getBody()->getOperations());
- second.erase();
- return success();
+ // 2. For each inner loop check that the iter_args for the immediately outer
+ // loop are the init values of the immediately inner loop, and that the
+ // results yielded by the inner loop are what the immediately outer loop yields. Keep
+ // track of where the chain starts from for each loop.
+ SmallVector<unsigned> iterArgChainStart(loops.size());
+ iterArgChainStart[0] = 0;
+ for (unsigned i = 1, e = loops.size(); i < e; ++i) {
+ // By default set the start of the chain to itself.
+ iterArgChainStart[i] = i;
+ auto outerloop = loops[i - 1];
+ auto innerLoop = loops[i];
+ if (outerloop.getNumRegionIterArgs() != innerLoop.getNumRegionIterArgs()) {
+ continue;
+ }
+ if (!llvm::equal(outerloop.getRegionIterArgs(), innerLoop.getInitArgs())) {
+ continue;
+ }
+ auto outerloopTerminator = outerloop.getBody()->getTerminator();
+ if (!llvm::equal(outerloopTerminator->getOperands(),
+ innerLoop.getResults())) {
+ continue;
+ }
+ iterArgChainStart[i] = iterArgChainStart[i - 1];
+ }
+
+ // 3. Identify bands of loops such that the operands of all of them are
+ // defined above the first loop in the band. Traverse the nest bottom-up
+ // so that modifications don't invalidate the inner loops.
+ for (unsigned end = loops.size(); end > 0; --end) {
+ unsigned start = 0;
+ for (; start < end - 1; ++start) {
+ auto maxPos =
+ *std::max_element(std::next(operandsDefinedAbove.begin(), start),
+ std::next(operandsDefinedAbove.begin(), end));
+ if (maxPos > start)
+ continue;
+ if (iterArgChainStart[end - 1] > start)
+ continue;
+ auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
+ if (succeeded(coalesceLoops(band)))
+ result = success();
+ break;
+ }
+ // If a band was found and transformed, keep looking at the loops above
+ // the outermost transformed loop.
+ if (start != end - 1)
+ end = start + 1;
+ }
+ return result;
}
void mlir::collapseParallelLoops(
- scf::ParallelOp loops, ArrayRef<std::vector<unsigned>> combinedDimensions) {
- OpBuilder outsideBuilder(loops);
+ RewriterBase &rewriter, scf::ParallelOp loops,
+ ArrayRef<std::vector<unsigned>> combinedDimensions) {
+ OpBuilder::InsertionGuard g(rewriter);
+ rewriter.setInsertionPoint(loops);
Location loc = loops.getLoc();
// Presort combined dimensions.
@@ -619,25 +744,29 @@ void mlir::collapseParallelLoops(
SmallVector<Value, 3> normalizedLowerBounds, normalizedSteps,
normalizedUpperBounds;
for (unsigned i = 0, e = loops.getNumLoops(); i < e; ++i) {
- OpBuilder insideLoopBuilder = OpBuilder::atBlockBegin(loops.getBody());
- auto resultBounds =
- normalizeLoop(outsideBuilder, insideLoopBuilder, loc,
- loops.getLowerBound()[i], loops.getUpperBound()[i],
- loops.getStep()[i], loops.getBody()->getArgument(i));
-
- normalizedLowerBounds.push_back(resultBounds.lowerBound);
- normalizedUpperBounds.push_back(resultBounds.upperBound);
- normalizedSteps.push_back(resultBounds.step);
+ OpBuilder::InsertionGuard g2(rewriter);
+ rewriter.setInsertionPoint(loops);
+ Value lb = loops.getLowerBound()[i];
+ Value ub = loops.getUpperBound()[i];
+ Value step = loops.getStep()[i];
+ auto newLoopParams = emitNormalizedLoopBounds(rewriter, loc, lb, ub, step);
+ normalizedLowerBounds.push_back(newLoopParams.lowerBound);
+ normalizedUpperBounds.push_back(newLoopParams.upperBound);
+ normalizedSteps.push_back(newLoopParams.step);
+
+ rewriter.setInsertionPointToStart(loops.getBody());
+ denormalizeInductionVariable(rewriter, loc, loops.getInductionVars()[i], lb,
+ step);
}
// Combine iteration spaces.
SmallVector<Value, 3> lowerBounds, upperBounds, steps;
- auto cst0 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 0);
- auto cst1 = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
+ auto cst0 = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+ auto cst1 = rewriter.create<arith::ConstantIndexOp>(loc, 1);
for (auto &sortedDimension : sortedDimensions) {
- Value newUpperBound = outsideBuilder.create<arith::ConstantIndexOp>(loc, 1);
+ Value newUpperBound = rewriter.create<arith::ConstantIndexOp>(loc, 1);
for (auto idx : sortedDimension) {
- newUpperBound = outsideBuilder.create<arith::MulIOp>(
+ newUpperBound = rewriter.create<arith::MulIOp>(
loc, newUpperBound, normalizedUpperBounds[idx]);
}
lowerBounds.push_back(cst0);
@@ -651,7 +780,7 @@ void mlir::collapseParallelLoops(
// value. The remainders then determine based on that range, which iteration
// of the original induction value this represents. This is a normalized value
// that is un-normalized already by the previous logic.
- auto newPloop = outsideBuilder.create<scf::ParallelOp>(
+ auto newPloop = rewriter.create<scf::ParallelOp>(
loc, lowerBounds, upperBounds, steps,
[&](OpBuilder &insideBuilder, Location, ValueRange ploopIVs) {
for (unsigned i = 0, e = combinedDimensions.size(); i < e; ++i) {
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
index d4c17928d4ca..acea25f02398 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
@@ -274,7 +274,7 @@ struct SparseTensorCodegenPass
});
// The following operations and dialects may be introduced by the
// codegen rules, and are therefore marked as legal.
- target.addLegalOp<linalg::FillOp>();
+ target.addLegalOp<linalg::FillOp, linalg::YieldOp>();
target.addLegalDialect<
arith::ArithDialect, bufferization::BufferizationDialect,
complex::ComplexDialect, memref::MemRefDialect, scf::SCFDialect>();
diff --git a/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp b/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp
index 6d7e3bc70f59..52359fa8a510 100644
--- a/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp
+++ b/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp
@@ -47,17 +47,26 @@ ScalableValueBoundsConstraintSet::computeScalableBound(
unsigned vscaleMax, presburger::BoundType boundType, bool closedUB,
StopConditionFn stopCondition) {
using namespace presburger;
-
assert(vscaleMin <= vscaleMax);
- ScalableValueBoundsConstraintSet scalableCstr(value.getContext(), vscaleMin,
- vscaleMax);
- int64_t pos = scalableCstr.populateConstraintsSet(value, dim, stopCondition);
+ // No stop condition specified: Keep adding constraints until the worklist
+ // is empty.
+ auto defaultStopCondition = [&](Value v, std::optional<int64_t> dim,
+ mlir::ValueBoundsConstraintSet &cstr) {
+ return false;
+ };
+
+ ScalableValueBoundsConstraintSet scalableCstr(
+ value.getContext(), stopCondition ? stopCondition : defaultStopCondition,
+ vscaleMin, vscaleMax);
+ int64_t pos = scalableCstr.populateConstraintsSet(value, dim);
// Project out all variables apart from vscale.
// This should result in constraints in terms of vscale only.
- scalableCstr.projectOut(
- [&](ValueDim p) { return p.first != scalableCstr.getVscaleValue(); });
+ auto projectOutFn = [&](ValueDim p) {
+ return p.first != scalableCstr.getVscaleValue();
+ };
+ scalableCstr.projectOut(projectOutFn);
assert(scalableCstr.cstr.getNumDimAndSymbolVars() ==
scalableCstr.positionToValueDim.size() &&
diff --git a/mlir/lib/IR/PatternMatch.cpp b/mlir/lib/IR/PatternMatch.cpp
index 5944a0ea46a1..286f47ce6913 100644
--- a/mlir/lib/IR/PatternMatch.cpp
+++ b/mlir/lib/IR/PatternMatch.cpp
@@ -11,6 +11,7 @@
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/Iterators.h"
#include "mlir/IR/RegionKindInterface.h"
+#include "llvm/ADT/SmallPtrSet.h"
using namespace mlir;
@@ -250,6 +251,14 @@ void RewriterBase::finalizeOpModification(Operation *op) {
rewriteListener->notifyOperationModified(op);
}
+void RewriterBase::replaceAllUsesExcept(
+ Value from, Value to, const SmallPtrSetImpl<Operation *> &preservedUsers) {
+ return replaceUsesWithIf(from, to, [&](OpOperand &use) {
+ Operation *user = use.getOwner();
+ return !preservedUsers.contains(user);
+ });
+}
+
void RewriterBase::replaceUsesWithIf(Value from, Value to,
function_ref<bool(OpOperand &)> functor,
bool *allUsesReplaced) {
diff --git a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
index 99598f2e89d9..0d362c7efa0a 100644
--- a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
+++ b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
@@ -67,8 +67,11 @@ static std::optional<int64_t> getConstantIntValue(OpFoldResult ofr) {
return std::nullopt;
}
-ValueBoundsConstraintSet::ValueBoundsConstraintSet(MLIRContext *ctx)
- : builder(ctx) {}
+ValueBoundsConstraintSet::ValueBoundsConstraintSet(
+ MLIRContext *ctx, StopConditionFn stopCondition)
+ : builder(ctx), stopCondition(stopCondition) {
+ assert(stopCondition && "expected non-null stop condition");
+}
char ValueBoundsConstraintSet::ID = 0;
@@ -193,7 +196,8 @@ static Operation *getOwnerOfValue(Value value) {
return value.getDefiningOp();
}
-void ValueBoundsConstraintSet::processWorklist(StopConditionFn stopCondition) {
+void ValueBoundsConstraintSet::processWorklist() {
+ LLVM_DEBUG(llvm::dbgs() << "Processing value bounds worklist...\n");
while (!worklist.empty()) {
int64_t pos = worklist.front();
worklist.pop();
@@ -214,13 +218,19 @@ void ValueBoundsConstraintSet::processWorklist(StopConditionFn stopCondition) {
// Do not process any further if the stop condition is met.
auto maybeDim = dim == kIndexValue ? std::nullopt : std::make_optional(dim);
- if (stopCondition(value, maybeDim))
+ if (stopCondition(value, maybeDim, *this)) {
+ LLVM_DEBUG(llvm::dbgs() << "Stop condition met for: " << value
+ << " (dim: " << maybeDim << ")\n");
continue;
+ }
// Query `ValueBoundsOpInterface` for constraints. New items may be added to
// the worklist.
auto valueBoundsOp =
dyn_cast<ValueBoundsOpInterface>(getOwnerOfValue(value));
+ LLVM_DEBUG(llvm::dbgs()
+ << "Query value bounds for: " << value
+ << " (owner: " << getOwnerOfValue(value)->getName() << ")\n");
if (valueBoundsOp) {
if (dim == kIndexValue) {
valueBoundsOp.populateBoundsForIndexValue(value, *this);
@@ -229,6 +239,7 @@ void ValueBoundsConstraintSet::processWorklist(StopConditionFn stopCondition) {
}
continue;
}
+ LLVM_DEBUG(llvm::dbgs() << "--> ValueBoundsOpInterface not implemented\n");
// If the op does not implement `ValueBoundsOpInterface`, check if it
// implements the `DestinationStyleOpInterface`. OpResults of such ops are
@@ -278,32 +289,20 @@ LogicalResult ValueBoundsConstraintSet::computeBound(
bool closedUB) {
#ifndef NDEBUG
assertValidValueDim(value, dim);
- assert(!stopCondition(value, dim) &&
- "stop condition should not be satisfied for starting point");
#endif // NDEBUG
int64_t ubAdjustment = closedUB ? 0 : 1;
Builder b(value.getContext());
mapOperands.clear();
- if (stopCondition(value, dim)) {
- // Special case: If the stop condition is satisfied for the input
- // value/dimension, directly return it.
- mapOperands.push_back(std::make_pair(value, dim));
- AffineExpr bound = b.getAffineDimExpr(0);
- if (type == BoundType::UB)
- bound = bound + ubAdjustment;
- resultMap = AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0,
- b.getAffineDimExpr(0));
- return success();
- }
-
// Process the backward slice of `value` (i.e., reverse use-def chain) until
// `stopCondition` is met.
ValueDim valueDim = std::make_pair(value, dim.value_or(kIndexValue));
- ValueBoundsConstraintSet cstr(value.getContext());
+ ValueBoundsConstraintSet cstr(value.getContext(), stopCondition);
+ assert(!stopCondition(value, dim, cstr) &&
+ "stop condition should not be satisfied for starting point");
int64_t pos = cstr.insert(value, dim, /*isSymbol=*/false);
- cstr.processWorklist(stopCondition);
+ cstr.processWorklist();
// Project out all variables (apart from `valueDim`) that do not match the
// stop condition.
@@ -313,7 +312,7 @@ LogicalResult ValueBoundsConstraintSet::computeBound(
return false;
auto maybeDim =
p.second == kIndexValue ? std::nullopt : std::make_optional(p.second);
- return !stopCondition(p.first, maybeDim);
+ return !stopCondition(p.first, maybeDim, cstr);
});
// Compute lower and upper bounds for `valueDim`.
@@ -419,7 +418,7 @@ LogicalResult ValueBoundsConstraintSet::computeDependentBound(
bool closedUB) {
return computeBound(
resultMap, mapOperands, type, value, dim,
- [&](Value v, std::optional<int64_t> d) {
+ [&](Value v, std::optional<int64_t> d, ValueBoundsConstraintSet &cstr) {
return llvm::is_contained(dependencies, std::make_pair(v, d));
},
closedUB);
@@ -455,7 +454,9 @@ LogicalResult ValueBoundsConstraintSet::computeIndependentBound(
// Reify bounds in terms of any independent values.
return computeBound(
resultMap, mapOperands, type, value, dim,
- [&](Value v, std::optional<int64_t> d) { return isIndependent(v); },
+ [&](Value v, std::optional<int64_t> d, ValueBoundsConstraintSet &cstr) {
+ return isIndependent(v);
+ },
closedUB);
}
@@ -488,21 +489,19 @@ FailureOr<int64_t> ValueBoundsConstraintSet::computeConstantBound(
presburger::BoundType type, AffineMap map, ValueDimList operands,
StopConditionFn stopCondition, bool closedUB) {
assert(map.getNumResults() == 1 && "expected affine map with one result");
- ValueBoundsConstraintSet cstr(map.getContext());
- int64_t pos = 0;
- if (stopCondition) {
- cstr.populateConstraintsSet(map, operands, stopCondition, &pos);
- } else {
- // No stop condition specified: Keep adding constraints until a bound could
- // be computed.
- cstr.populateConstraintsSet(
- map, operands,
- [&](Value v, std::optional<int64_t> dim) {
- return cstr.cstr.getConstantBound64(type, pos).has_value();
- },
- &pos);
- }
+ // Default stop condition if none was specified: Keep adding constraints until
+ // a bound could be computed.
+ int64_t pos;
+ auto defaultStopCondition = [&](Value v, std::optional<int64_t> dim,
+ ValueBoundsConstraintSet &cstr) {
+ return cstr.cstr.getConstantBound64(type, pos).has_value();
+ };
+
+ ValueBoundsConstraintSet cstr(
+ map.getContext(), stopCondition ? stopCondition : defaultStopCondition);
+ cstr.populateConstraintsSet(map, operands, &pos);
+
// Compute constant bound for `valueDim`.
int64_t ubAdjustment = closedUB ? 0 : 1;
if (auto bound = cstr.cstr.getConstantBound64(type, pos))
@@ -510,8 +509,9 @@ FailureOr<int64_t> ValueBoundsConstraintSet::computeConstantBound(
return failure();
}
-int64_t ValueBoundsConstraintSet::populateConstraintsSet(
- Value value, std::optional<int64_t> dim, StopConditionFn stopCondition) {
+int64_t
+ValueBoundsConstraintSet::populateConstraintsSet(Value value,
+ std::optional<int64_t> dim) {
#ifndef NDEBUG
assertValidValueDim(value, dim);
#endif // NDEBUG
@@ -519,12 +519,12 @@ int64_t ValueBoundsConstraintSet::populateConstraintsSet(
AffineMap map =
AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0,
Builder(value.getContext()).getAffineDimExpr(0));
- return populateConstraintsSet(map, {{value, dim}}, stopCondition);
+ return populateConstraintsSet(map, {{value, dim}});
}
-int64_t ValueBoundsConstraintSet::populateConstraintsSet(
- AffineMap map, ValueDimList operands, StopConditionFn stopCondition,
- int64_t *posOut) {
+int64_t ValueBoundsConstraintSet::populateConstraintsSet(AffineMap map,
+ ValueDimList operands,
+ int64_t *posOut) {
assert(map.getNumResults() == 1 && "expected affine map with one result");
int64_t pos = insert(/*isSymbol=*/false);
if (posOut)
@@ -545,13 +545,7 @@ int64_t ValueBoundsConstraintSet::populateConstraintsSet(
// Process the backward slice of `operands` (i.e., reverse use-def chain)
// until `stopCondition` is met.
- if (stopCondition) {
- processWorklist(stopCondition);
- } else {
- // No stop condition specified: Keep adding constraints until the worklist
- // is empty.
- processWorklist([](Value v, std::optional<int64_t> dim) { return false; });
- }
+ processWorklist();
return pos;
}
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 646d0ed73084..08ec57803aff 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -804,13 +804,13 @@ convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder,
/// Allocate space for privatized reduction variables.
template <typename T>
-static void
-allocByValReductionVars(T loop, llvm::IRBuilderBase &builder,
- LLVM::ModuleTranslation &moduleTranslation,
- llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
- SmallVector<omp::DeclareReductionOp> &reductionDecls,
- SmallVector<llvm::Value *> &privateReductionVariables,
- DenseMap<Value, llvm::Value *> &reductionVariableMap) {
+static void allocByValReductionVars(
+ T loop, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation,
+ llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+ SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+ SmallVectorImpl<llvm::Value *> &privateReductionVariables,
+ DenseMap<Value, llvm::Value *> &reductionVariableMap) {
llvm::IRBuilderBase::InsertPointGuard guard(builder);
builder.restoreIP(allocaIP);
auto args =
@@ -825,6 +825,25 @@ allocByValReductionVars(T loop, llvm::IRBuilderBase &builder,
}
}
+/// Map input argument to all reduction initialization regions
+template <typename T>
+static void
+mapInitializationArg(T loop, LLVM::ModuleTranslation &moduleTranslation,
+ SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+ unsigned i) {
+ // map input argument to the initialization region
+ mlir::omp::DeclareReductionOp &reduction = reductionDecls[i];
+ Region &initializerRegion = reduction.getInitializerRegion();
+ Block &entry = initializerRegion.front();
+ assert(entry.getNumArguments() == 1 &&
+ "the initialization region has one argument");
+
+ mlir::Value mlirSource = loop.getReductionVars()[i];
+ llvm::Value *llvmSource = moduleTranslation.lookupValue(mlirSource);
+ assert(llvmSource && "lookup reduction var");
+ moduleTranslation.mapValue(entry.getArgument(0), llvmSource);
+}
+
/// Collect reduction info
template <typename T>
static void collectReductionInfo(
@@ -858,6 +877,32 @@ static void collectReductionInfo(
}
}
+/// handling of DeclareReductionOp's cleanup region
+static LogicalResult inlineReductionCleanup(
+ llvm::SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
+ llvm::ArrayRef<llvm::Value *> privateReductionVariables,
+ LLVM::ModuleTranslation &moduleTranslation, llvm::IRBuilderBase &builder) {
+ for (auto [i, reductionDecl] : llvm::enumerate(reductionDecls)) {
+ Region &cleanupRegion = reductionDecl.getCleanupRegion();
+ if (cleanupRegion.empty())
+ continue;
+
+ // map the argument to the cleanup region
+ Block &entry = cleanupRegion.front();
+ moduleTranslation.mapValue(entry.getArgument(0),
+ privateReductionVariables[i]);
+
+ if (failed(inlineConvertOmpRegions(cleanupRegion, "omp.reduction.cleanup",
+ builder, moduleTranslation)))
+ return failure();
+
+ // clear block argument mapping in case it needs to be re-created with a
+ // different source for another use of the same reduction decl
+ moduleTranslation.forgetMapping(cleanupRegion);
+ }
+ return success();
+}
+
/// Converts an OpenMP workshare loop into LLVM IR using OpenMPIRBuilder.
static LogicalResult
convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
@@ -902,6 +947,10 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
loop.getRegion().getArguments().take_back(loop.getNumReductionVars());
for (unsigned i = 0; i < loop.getNumReductionVars(); ++i) {
SmallVector<llvm::Value *> phis;
+
+ // map block argument to initializer region
+ mapInitializationArg(loop, moduleTranslation, reductionDecls, i);
+
if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),
"omp.reduction.neutral", builder,
moduleTranslation, &phis)))
@@ -925,6 +974,11 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
builder.CreateStore(phis[0], privateReductionVariables[i]);
// the rest was handled in allocByValReductionVars
}
+
+ // forget the mapping for the initializer region because we might need a
+ // different mapping if this reduction declaration is re-used for a
+ // different variable
+ moduleTranslation.forgetMapping(reductionDecls[i].getInitializerRegion());
}
// Store the mapping between reduction variables and their private copies on
@@ -1044,7 +1098,9 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
tempTerminator->eraseFromParent();
builder.restoreIP(nextInsertionPoint);
- return success();
+ // after the workshare loop, deallocate private reduction variables
+ return inlineReductionCleanup(reductionDecls, privateReductionVariables,
+ moduleTranslation, builder);
}
/// A RAII class that on construction replaces the region arguments of the
@@ -1097,13 +1153,13 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
LogicalResult bodyGenStatus = success();
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
- auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
- // Collect reduction declarations
- SmallVector<omp::DeclareReductionOp> reductionDecls;
- collectReductionDecls(opInst, reductionDecls);
+ // Collect reduction declarations
+ SmallVector<omp::DeclareReductionOp> reductionDecls;
+ collectReductionDecls(opInst, reductionDecls);
+ SmallVector<llvm::Value *> privateReductionVariables;
+ auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) {
// Allocate reduction vars
- SmallVector<llvm::Value *> privateReductionVariables;
DenseMap<Value, llvm::Value *> reductionVariableMap;
if (!isByRef) {
allocByValReductionVars(opInst, builder, moduleTranslation, allocaIP,
@@ -1118,6 +1174,9 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
opInst.getNumReductionVars());
for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
SmallVector<llvm::Value *> phis;
+
+ // map the block argument
+ mapInitializationArg(opInst, moduleTranslation, reductionDecls, i);
if (failed(inlineConvertOmpRegions(
reductionDecls[i].getInitializerRegion(), "omp.reduction.neutral",
builder, moduleTranslation, &phis)))
@@ -1144,6 +1203,10 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
builder.CreateStore(phis[0], privateReductionVariables[i]);
// the rest is done in allocByValReductionVars
}
+
+ // clear block argument mapping in case it needs to be re-created with a
+ // different source for another use of the same reduction decl
+ moduleTranslation.forgetMapping(reductionDecls[i].getInitializerRegion());
}
// Store the mapping between reduction variables and their private copies on
@@ -1296,7 +1359,18 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
// TODO: Perform finalization actions for variables. This has to be
// called for variables which have destructors/finalizers.
- auto finiCB = [&](InsertPointTy codeGenIP) {};
+ auto finiCB = [&](InsertPointTy codeGenIP) {
+ InsertPointTy oldIP = builder.saveIP();
+ builder.restoreIP(codeGenIP);
+
+ // if the reduction has a cleanup region, inline it here to finalize the
+ // reduction variables
+ if (failed(inlineReductionCleanup(reductionDecls, privateReductionVariables,
+ moduleTranslation, builder)))
+ bodyGenStatus = failure();
+
+ builder.restoreIP(oldIP);
+ };
llvm::Value *ifCond = nullptr;
if (auto ifExprVar = opInst.getIfExprVar())
diff --git a/mlir/lib/Transforms/Mem2Reg.cpp b/mlir/lib/Transforms/Mem2Reg.cpp
index 80e3b7901632..abe565ea862f 100644
--- a/mlir/lib/Transforms/Mem2Reg.cpp
+++ b/mlir/lib/Transforms/Mem2Reg.cpp
@@ -202,6 +202,7 @@ private:
/// Contains the reaching definition at this operation. Reaching definitions
/// are only computed for promotable memory operations with blocking uses.
DenseMap<PromotableMemOpInterface, Value> reachingDefs;
+ DenseMap<PromotableMemOpInterface, Value> replacedValuesMap;
DominanceInfo &dominance;
MemorySlotPromotionInfo info;
const Mem2RegStatistics &statistics;
@@ -438,6 +439,7 @@ Value MemorySlotPromoter::computeReachingDefInBlock(Block *block,
assert(stored && "a memory operation storing to a slot must provide a "
"new definition of the slot");
reachingDef = stored;
+ replacedValuesMap[memOp] = stored;
}
}
}
@@ -552,6 +554,10 @@ void MemorySlotPromoter::removeBlockingUses() {
dominanceSort(usersToRemoveUses, *slot.ptr.getParentBlock()->getParent());
llvm::SmallVector<Operation *> toErase;
+ // List of all replaced values in the slot.
+ llvm::SmallVector<std::pair<Operation *, Value>> replacedValuesList;
+ // Ops to visit with the `visitReplacedValues` method.
+ llvm::SmallVector<PromotableOpInterface> toVisit;
for (Operation *toPromote : llvm::reverse(usersToRemoveUses)) {
if (auto toPromoteMemOp = dyn_cast<PromotableMemOpInterface>(toPromote)) {
Value reachingDef = reachingDefs.lookup(toPromoteMemOp);
@@ -565,7 +571,9 @@ void MemorySlotPromoter::removeBlockingUses() {
slot, info.userToBlockingUses[toPromote], rewriter,
reachingDef) == DeletionKind::Delete)
toErase.push_back(toPromote);
-
+ if (toPromoteMemOp.storesTo(slot))
+ if (Value replacedValue = replacedValuesMap[toPromoteMemOp])
+ replacedValuesList.push_back({toPromoteMemOp, replacedValue});
continue;
}
@@ -574,6 +582,12 @@ void MemorySlotPromoter::removeBlockingUses() {
if (toPromoteBasic.removeBlockingUses(info.userToBlockingUses[toPromote],
rewriter) == DeletionKind::Delete)
toErase.push_back(toPromote);
+ if (toPromoteBasic.requiresReplacedValues())
+ toVisit.push_back(toPromoteBasic);
+ }
+ for (PromotableOpInterface op : toVisit) {
+ rewriter.setInsertionPointAfter(op);
+ op.visitReplacedValues(replacedValuesList, rewriter);
}
for (Operation *toEraseOp : toErase)
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir
index 17eec5936918..ad65410e635e 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir
@@ -15,3 +15,15 @@ func.func @tensor_with_unknown_rank(%arg0: tensor<*xi8>) -> tensor<*xi8> {
%0 = "tosa.abs"(%arg0) : (tensor<*xi8>) -> tensor<*xi8>
return %0 : tensor<*xi8>
}
+
+// -----
+
+// CHECK-LABEL: @unranked_add
+func.func @unranked_add(%arg0 : tensor<10x10xf32> , %arg1 : tensor<10x10xf32>, %arg2 : tensor<*xf32>) -> (tensor<10x10xf32>) {
+ // expected-error@+3 {{failed to legalize operation 'tosa.add'}}
+ %reduce = tosa.reduce_max %arg0 {axis = 1 : i32} : (tensor<10x10xf32>) -> tensor<10x1xf32>
+ %1 = tosa.add %reduce, %arg1 : (tensor<10x1xf32>, tensor<10x10xf32>) -> tensor<10x10xf32>
+ %0 = tosa.add %1, %arg2 : (tensor<10x10xf32>, tensor<*xf32>) -> tensor<*xf32>
+ %2 = tosa.reshape %0 {new_shape = array<i64: 10, 10>} : (tensor<*xf32>) -> tensor<10x10xf32>
+ return %2 : tensor<10x10xf32>
+}
diff --git a/mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir b/mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir
index b53fc55fdac9..d998ed874f41 100644
--- a/mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir
+++ b/mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -affine-super-vectorizer-test -compose-maps -split-input-file 2>&1 | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect %s -affine-super-vectorizer-test=compose-maps -split-input-file 2>&1 | FileCheck %s
// For all these cases, the test traverses the `test_affine_map` ops and
// composes them in order one-by-one.
diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir
index e58de9fdaa76..bd71164244c0 100644
--- a/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir
+++ b/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir
@@ -1,6 +1,6 @@
-// RUN: mlir-opt %s -affine-super-vectorizer-test -vector-shape-ratio 4 -vector-shape-ratio 8 2>&1 | FileCheck %s
-// RUN: mlir-opt %s -affine-super-vectorizer-test -vector-shape-ratio 2 -vector-shape-ratio 5 -vector-shape-ratio 2 2>&1 | FileCheck %s -check-prefix=TEST-3x4x5x8
-// RUN: mlir-opt %s -affine-super-vectorizer-test -vectorize-affine-loop-nest 2>&1 | FileCheck %s -check-prefix=VECNEST
+// RUN: mlir-opt %s -affine-super-vectorizer-test="vector-shape-ratio=4,8" 2>&1 | FileCheck %s
+// RUN: mlir-opt %s -affine-super-vectorizer-test="vector-shape-ratio=2,5,2" 2>&1 | FileCheck %s -check-prefix=TEST-3x4x5x8
+// RUN: mlir-opt %s -affine-super-vectorizer-test=vectorize-affine-loop-nest 2>&1 | FileCheck %s -check-prefix=VECNEST
func.func @vector_add_2d(%arg0: index, %arg1: index) -> f32 {
// Nothing should be matched in this first block.
diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_unsupported.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_unsupported.mlir
index c117bfc82ca0..6c1a7c48c4cb 100644
--- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_unsupported.mlir
+++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_unsupported.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -affine-super-vectorizer-test -vectorize-affine-loop-nest -split-input-file 2>&1 | FileCheck %s
+// RUN: mlir-opt %s -affine-super-vectorizer-test=vectorize-affine-loop-nest -split-input-file 2>&1 | FileCheck %s
func.func @unparallel_loop_reduction_unsupported(%in: memref<256x512xf32>, %out: memref<256xf32>) {
// CHECK: Outermost loop cannot be parallel
diff --git a/mlir/test/Dialect/Affine/loop-coalescing.mlir b/mlir/test/Dialect/Affine/loop-coalescing.mlir
index 9c17fb24be69..ae0adf5a0a02 100644
--- a/mlir/test/Dialect/Affine/loop-coalescing.mlir
+++ b/mlir/test/Dialect/Affine/loop-coalescing.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -split-input-file -allow-unregistered-dialect -affine-loop-coalescing %s | FileCheck %s
+// RUN: mlir-opt -split-input-file -allow-unregistered-dialect -affine-loop-coalescing --cse %s | FileCheck %s
// CHECK-LABEL: @one_3d_nest
func.func @one_3d_nest() {
@@ -239,19 +239,15 @@ func.func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
}
return
}
-// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
-// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
-// CHECK: %[[T2:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
-// CHECK-DAG: %[[T3:.*]] = affine.apply #[[IDENTITY]]()[%[[T0]]]
-// CHECK-DAG: %[[T4:.*]] = affine.apply #[[IDENTITY]]()[%[[T1]]]
-// CHECK-DAG: %[[T5:.*]] = affine.apply #[[PRODUCT]](%[[T3]])[%[[T4]]]
-// CHECK-DAG: %[[T6:.*]] = affine.apply #[[IDENTITY]]()[%[[T2]]]
-// CHECK-DAG: %[[T7:.*]] = affine.apply #[[PRODUCT]](%[[T5]])[%[[T6]]]
-// CHECK: affine.for %[[IV:.*]] = 0 to %[[T7]]
-// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T6]]]
-// CHECK-DAG: %[[T9:.*]] = affine.apply #[[FLOOR]](%[[IV]])[%[[T6]]]
-// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T4]]]
-// CHECK-DAG: %[[I:.*]] = affine.apply #[[FLOOR]](%[[T9]])[%[[T4]]]
+// CHECK: %[[DIM:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK-DAG: %[[T0:.*]] = affine.apply #[[IDENTITY]]()[%[[DIM]]]
+// CHECK-DAG: %[[T1:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T0]]]
+// CHECK-DAG: %[[T2:.*]] = affine.apply #[[PRODUCT]](%[[T1]])[%[[T0]]]
+// CHECK: affine.for %[[IV:.*]] = 0 to %[[T2]]
+// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T0]]]
+// CHECK-DAG: %[[T9:.*]] = affine.apply #[[FLOOR]](%[[IV]])[%[[T0]]]
+// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T0]]]
+// CHECK-DAG: %[[I:.*]] = affine.apply #[[FLOOR]](%[[T9]])[%[[T0]]]
// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]])
// CHECK-NEXT: }
// CHECK-NEXT: return
@@ -277,18 +273,16 @@ func.func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
}
return
}
-// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
-// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
-// CHECK-DAG: %[[T2:.*]] = affine.apply #[[IDENTITY]]()[%[[T0]]]
-// CHECK-DAG: %[[T3:.*]] = affine.apply #[[IDENTITY]]()[%[[T1]]]
-// CHECK-DAG: %[[T4:.*]] = affine.apply #[[PRODUCT]](%[[T2]])[%[[T3]]]
-// CHECK-DAG: %[[T5:.*]] = affine.apply #[[SIXTY_FOUR]]()
-// CHECK-DAG: %[[T6:.*]] = affine.apply #[[PRODUCT]](%[[T4]])[%[[T5]]]
-// CHECK: affine.for %[[IV:.*]] = 0 to %[[T6]]
-// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T5]]]
-// CHECK-DAG: %[[T8:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T5]]]
-// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T8]])[%[[T3]]]
-// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T8]])[%[[T3]]]
+// CHECK: %[[DIM:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK-DAG: %[[T0:.*]] = affine.apply #[[IDENTITY]]()[%[[DIM]]]
+// CHECK-DAG: %[[T1:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T0]]]
+// CHECK-DAG: %[[T2:.*]] = affine.apply #[[SIXTY_FOUR]]()
+// CHECK-DAG: %[[T3:.*]] = affine.apply #[[PRODUCT]](%[[T1]])[%[[T2]]]
+// CHECK: affine.for %[[IV:.*]] = 0 to %[[T3]]
+// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T2]]]
+// CHECK-DAG: %[[T5:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T2]]]
+// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T5]])[%[[T0]]]
+// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T5]])[%[[T0]]]
// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]])
// CHECK-NEXT: }
// CHECK-NEXT: return
@@ -316,19 +310,16 @@ func.func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
}
return
}
-// CHECK: %[[T0:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
-// CHECK: %[[T1:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
-// CHECK: %[[T2:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
-// CHECK-DAG: %[[T3:.*]] = affine.min #[[MAP0]]()[%[[T0]]]
-// CHECK-DAG: %[[T4:.*]] = affine.apply #[[IDENTITY]]()[%[[T1]]]
-// CHECK-DAG: %[[T5:.*]] = affine.apply #[[PRODUCT]](%[[T3]])[%[[T4]]]
-// CHECK-DAG: %[[T6:.*]] = affine.apply #[[IDENTITY]]()[%[[T2]]]
-// CHECK-DAG: %[[T7:.*]] = affine.apply #[[PRODUCT]](%[[T5]])[%[[T6]]]
-// CHECK: affine.for %[[IV:.*]] = 0 to %[[T7]]
-// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T6]]]
-// CHECK-DAG: %[[T9:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T6]]]
-// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T9]])[%[[T4]]]
-// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T9]])[%[[T4]]]
+// CHECK: %[[DIM:.*]] = memref.dim %arg{{.*}}, %c{{.*}} : memref<?x?xf32>
+// CHECK-DAG: %[[T0:.*]] = affine.min #[[MAP0]]()[%[[DIM]]]
+// CHECK-DAG: %[[T1:.*]] = affine.apply #[[IDENTITY]]()[%[[DIM]]]
+// CHECK-DAG: %[[T2:.*]] = affine.apply #[[PRODUCT]](%[[T0]])[%[[T1]]]
+// CHECK-DAG: %[[T3:.*]] = affine.apply #[[PRODUCT]](%[[T2]])[%[[T1]]]
+// CHECK: affine.for %[[IV:.*]] = 0 to %[[T3]]
+// CHECK-DAG: %[[K:.*]] = affine.apply #[[MOD]](%[[IV]])[%[[T1]]]
+// CHECK-DAG: %[[T5:.*]] = affine.apply #[[DIV]](%[[IV]])[%[[T1]]]
+// CHECK-DAG: %[[J:.*]] = affine.apply #[[MOD]](%[[T5]])[%[[T1]]]
+// CHECK-DAG: %[[I:.*]] = affine.apply #[[DIV]](%[[T5]])[%[[T1]]]
// CHECK-NEXT: "test.foo"(%[[I]], %[[J]], %[[K]])
// CHECK-NEXT: }
// CHECK-NEXT: return
@@ -342,12 +333,14 @@ func.func @coalesce_affine_for(%arg0: memref<?x?xf32>) {
func.func @test_loops_do_not_get_coalesced() {
affine.for %i = 0 to 7 {
affine.for %j = #map0(%i) to min #map1(%i) {
+ "use"(%i, %j) : (index, index) -> ()
}
}
return
}
// CHECK: affine.for %[[IV0:.*]] = 0 to 7
// CHECK-NEXT: affine.for %[[IV1:.*]] = #[[MAP0]](%[[IV0]]) to min #[[MAP1]](%[[IV0]])
+// CHECK-NEXT: "use"(%[[IV0]], %[[IV1]])
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return
diff --git a/mlir/test/Dialect/Affine/loop-fusion-dependence-check.mlir b/mlir/test/Dialect/Affine/loop-fusion-dependence-check.mlir
index aa872b0f6911..2c53852a8cec 100644
--- a/mlir/test/Dialect/Affine/loop-fusion-dependence-check.mlir
+++ b/mlir/test/Dialect/Affine/loop-fusion-dependence-check.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -test-loop-fusion -test-loop-fusion-dependence-check -split-input-file -verify-diagnostics | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect %s -test-loop-fusion=test-loop-fusion-dependence-check -split-input-file -verify-diagnostics | FileCheck %s
// -----
diff --git a/mlir/test/Dialect/Affine/loop-fusion-slice-computation.mlir b/mlir/test/Dialect/Affine/loop-fusion-slice-computation.mlir
index c303dd0c01ec..aa79ee26928e 100644
--- a/mlir/test/Dialect/Affine/loop-fusion-slice-computation.mlir
+++ b/mlir/test/Dialect/Affine/loop-fusion-slice-computation.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -test-loop-fusion -test-loop-fusion-slice-computation -split-input-file -verify-diagnostics | FileCheck %s
+// RUN: mlir-opt %s -test-loop-fusion=test-loop-fusion-slice-computation -split-input-file -verify-diagnostics | FileCheck %s
// -----
diff --git a/mlir/test/Dialect/Affine/loop-fusion-transformation.mlir b/mlir/test/Dialect/Affine/loop-fusion-transformation.mlir
index c8e0918e6bfd..4f4163a2bbd5 100644
--- a/mlir/test/Dialect/Affine/loop-fusion-transformation.mlir
+++ b/mlir/test/Dialect/Affine/loop-fusion-transformation.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -allow-unregistered-dialect -test-loop-fusion -test-loop-fusion-transformation -split-input-file -canonicalize | FileCheck %s
+// RUN: mlir-opt %s -allow-unregistered-dialect -test-loop-fusion=test-loop-fusion-transformation -split-input-file -canonicalize | FileCheck %s
// CHECK-LABEL: func @slice_depth1_loop_nest() {
func.func @slice_depth1_loop_nest() {
diff --git a/mlir/test/Dialect/Affine/slicing-utils.mlir b/mlir/test/Dialect/Affine/slicing-utils.mlir
index 71bd8ad4ef9d..74379978fdf8 100644
--- a/mlir/test/Dialect/Affine/slicing-utils.mlir
+++ b/mlir/test/Dialect/Affine/slicing-utils.mlir
@@ -1,6 +1,6 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-super-vectorizer-test -forward-slicing=true 2>&1 | FileCheck %s --check-prefix=FWD
-// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-super-vectorizer-test -backward-slicing=true 2>&1 | FileCheck %s --check-prefix=BWD
-// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-super-vectorizer-test -slicing=true 2>&1 | FileCheck %s --check-prefix=FWDBWD
+// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-super-vectorizer-test="forward-slicing=true" 2>&1 | FileCheck %s --check-prefix=FWD
+// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-super-vectorizer-test="backward-slicing=true" 2>&1 | FileCheck %s --check-prefix=BWD
+// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-super-vectorizer-test="slicing=true" 2>&1 | FileCheck %s --check-prefix=FWDBWD
/// 1 2 3 4
/// |_______| |______|
diff --git a/mlir/test/Dialect/ControlFlow/ops.mlir b/mlir/test/Dialect/ControlFlow/ops.mlir
index 8453c2b7038f..c9317c761397 100644
--- a/mlir/test/Dialect/ControlFlow/ops.mlir
+++ b/mlir/test/Dialect/ControlFlow/ops.mlir
@@ -38,3 +38,16 @@ func.func @switch_i64(%flag : i64, %caseOperand : i32) {
^bb3(%bb3arg : i32):
return
}
+
+// CHECK-LABEL: func @switch_result_number
+func.func @switch_result_number(%arg0: i32) {
+ %0:2 = "test.op_with_two_results"() : () -> (i32, i32)
+ cf.switch %arg0 : i32, [
+ default: ^bb2,
+ 0: ^bb1(%0#0 : i32)
+ ]
+ ^bb1(%1: i32):
+ return
+ ^bb2:
+ return
+}
diff --git a/mlir/test/Dialect/LLVMIR/mem2reg-dbginfo.mlir b/mlir/test/Dialect/LLVMIR/mem2reg-dbginfo.mlir
index f7ddb4a7abe5..b7cbd787f06e 100644
--- a/mlir/test/Dialect/LLVMIR/mem2reg-dbginfo.mlir
+++ b/mlir/test/Dialect/LLVMIR/mem2reg-dbginfo.mlir
@@ -29,6 +29,27 @@ llvm.func @basic_store_load(%arg0: i64) -> i64 {
llvm.return %2 : i64
}
+// CHECK-LABEL: llvm.func @multiple_store_load
+llvm.func @multiple_store_load(%arg0: i64) -> i64 {
+ %0 = llvm.mlir.constant(1 : i32) : i32
+ // CHECK-NOT: = llvm.alloca
+ %1 = llvm.alloca %0 x i64 {alignment = 8 : i64} : (i32) -> !llvm.ptr
+ // CHECK-NOT: llvm.intr.dbg.declare
+ llvm.intr.dbg.declare #di_local_variable = %1 : !llvm.ptr
+ // CHECK-NOT: llvm.store
+ llvm.store %arg0, %1 {alignment = 4 : i64} : i64, !llvm.ptr
+ // CHECK-NOT: llvm.intr.dbg.declare
+ llvm.intr.dbg.declare #di_local_variable = %1 : !llvm.ptr
+ // CHECK: llvm.intr.dbg.value #[[$VAR]] = %[[LOADED:.*]] : i64
+ // CHECK: llvm.intr.dbg.value #[[$VAR]] = %[[LOADED]] : i64
+ // CHECK-NOT: llvm.intr.dbg.value
+ // CHECK-NOT: llvm.intr.dbg.declare
+ // CHECK-NOT: llvm.store
+ %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i64
+ // CHECK: llvm.return %[[LOADED]] : i64
+ llvm.return %2 : i64
+}
+
// CHECK-LABEL: llvm.func @block_argument_value
// CHECK-SAME: (%[[ARG0:.*]]: i64, {{.*}})
llvm.func @block_argument_value(%arg0: i64, %arg1: i1) -> i64 {
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index a00383cf4405..1134db77d5ba 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -436,6 +436,25 @@ atomic {
// -----
+// expected-error @below {{op expects cleanup region with one argument of the reduction type}}
+omp.declare_reduction @add_f32 : f32
+init {
+^bb0(%arg: f32):
+ %0 = arith.constant 0.0 : f32
+ omp.yield (%0 : f32)
+}
+combiner {
+^bb1(%arg0: f32, %arg1: f32):
+ %1 = arith.addf %arg0, %arg1 : f32
+ omp.yield (%1 : f32)
+}
+cleanup {
+^bb0(%arg: f64):
+ omp.yield
+}
+
+// -----
+
func.func @foo(%lb : index, %ub : index, %step : index) {
%c1 = arith.constant 1 : i32
%0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index 30ce77423005..e2c255c7a3cc 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -603,6 +603,8 @@ func.func @omp_target_pretty(%if_cond : i1, %device : si32, %num_threads : i32)
// CHECK: atomic
// CHECK: ^{{.+}}(%{{.+}}: !llvm.ptr, %{{.+}}: !llvm.ptr):
// CHECK: omp.yield
+// CHECK: cleanup
+// CHECK: omp.yield
omp.declare_reduction @add_f32 : f32
init {
^bb0(%arg: f32):
@@ -620,6 +622,10 @@ atomic {
llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32
omp.yield
}
+cleanup {
+^bb0(%arg: f32):
+ omp.yield
+}
// CHECK-LABEL: func @wsloop_reduction
func.func @wsloop_reduction(%lb : index, %ub : index, %step : index) {
@@ -789,6 +795,7 @@ combiner {
omp.yield (%1 : f32)
}
// CHECK-NOT: atomic
+// CHECK-NOT: cleanup
// CHECK-LABEL: func @wsloop_reduction2
func.func @wsloop_reduction2(%lb : index, %ub : index, %step : index) {
@@ -2088,6 +2095,7 @@ func.func @opaque_pointers_atomic_rwu(%v: !llvm.ptr, %x: !llvm.ptr) {
// CHECK-LABEL: @opaque_pointers_reduction
// CHECK: atomic {
// CHECK-NEXT: ^{{[[:alnum:]]+}}(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr):
+// CHECK-NOT: cleanup
omp.declare_reduction @opaque_pointers_reduction : f32
init {
^bb0(%arg: f32):
diff --git a/mlir/test/Dialect/SCF/transform-op-coalesce.mlir b/mlir/test/Dialect/SCF/transform-op-coalesce.mlir
index 2d59331b72cf..4dc3e4ea0ef4 100644
--- a/mlir/test/Dialect/SCF/transform-op-coalesce.mlir
+++ b/mlir/test/Dialect/SCF/transform-op-coalesce.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics | FileCheck %s
+// RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics -allow-unregistered-dialect --cse | FileCheck %s
func.func @coalesce_inner() {
%c0 = arith.constant 0 : index
@@ -14,7 +14,7 @@ func.func @coalesce_inner() {
scf.for %k = %i to %j step %c1 {
// Inner loop must have been removed.
scf.for %l = %i to %j step %c1 {
- arith.addi %i, %j : index
+ "use"(%i, %j) : (index, index) -> ()
}
} {coalesce}
}
@@ -33,13 +33,19 @@ module attributes {transform.with_named_sequence} {
// -----
+// CHECK-DAG: #[[MAP:.+]] = affine_map<() -> (64)>
+// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (d0 * s0)>
+// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (d0 mod s0)>
+// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0] -> (d0 floordiv s0)>
func.func @coalesce_outer(%arg1: memref<64x64xf32, 1>, %arg2: memref<64x64xf32, 1>, %arg3: memref<64x64xf32, 1>) attributes {} {
+ // CHECK: %[[T0:.+]] = affine.apply #[[MAP]]()
+ // CHECK: %[[UB:.+]] = affine.apply #[[MAP1]](%[[T0]])[%[[T0]]]
// CHECK: affine.for %[[IV1:.+]] = 0 to %[[UB:.+]] {
// CHECK-NOT: affine.for %[[IV2:.+]]
affine.for %arg4 = 0 to 64 {
affine.for %arg5 = 0 to 64 {
- // CHECK: %[[IDX0:.+]] = affine.apply #[[MAP0:.+]](%[[IV1]])[%{{.+}}]
- // CHECK: %[[IDX1:.+]] = affine.apply #[[MAP1:.+]](%[[IV1]])[%{{.+}}]
+ // CHECK: %[[IDX0:.+]] = affine.apply #[[MAP2]](%[[IV1]])[%{{.+}}]
+ // CHECK: %[[IDX1:.+]] = affine.apply #[[MAP3]](%[[IV1]])[%{{.+}}]
// CHECK-NEXT: %{{.+}} = affine.load %{{.+}}[%[[IDX1]], %[[IDX0]]] : memref<64x64xf32, 1>
%0 = affine.load %arg1[%arg4, %arg5] : memref<64x64xf32, 1>
%1 = affine.load %arg2[%arg4, %arg5] : memref<64x64xf32, 1>
@@ -96,3 +102,200 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
+
+// -----
+
+func.func @tensor_loops(%arg0 : tensor<?x?xf32>, %lb0 : index, %ub0 : index, %step0 : index,
+ %lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) -> tensor<?x?xf32> {
+ %0 = scf.for %i = %lb0 to %ub0 step %step0 iter_args(%arg1 = %arg0) -> tensor<?x?xf32> {
+ %1 = scf.for %j = %lb1 to %ub1 step %step1 iter_args(%arg2 = %arg1) -> tensor<?x?xf32> {
+ %2 = scf.for %k = %lb2 to %ub2 step %step2 iter_args(%arg3 = %arg2) -> tensor<?x?xf32> {
+ %3 = "use"(%arg3, %i, %j, %k) : (tensor<?x?xf32>, index, index, index) -> (tensor<?x?xf32>)
+ scf.yield %3 : tensor<?x?xf32>
+ }
+ scf.yield %2 : tensor<?x?xf32>
+ }
+ scf.yield %1 : tensor<?x?xf32>
+ } {coalesce}
+ return %0 : tensor<?x?xf32>
+}
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for">
+ %2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">)
+ transform.yield
+ }
+}
+// CHECK: func.func @tensor_loops(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?xf32>
+// CHECK-SAME: %[[LB0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[UB0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[STEP0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[LB1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[UB1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[STEP1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[LB2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[UB2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[STEP2:[a-zA-Z0-9_]+]]: index
+// CHECK: %[[NEWUB0_DIFF:.+]] = arith.subi %[[UB0]], %[[LB0]]
+// CHECK-DAG: %[[NEWUB0:.+]] = arith.ceildivsi %[[NEWUB0_DIFF]], %[[STEP0]]
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1
+// CHECK: %[[NEWUB1_DIFF:.+]] = arith.subi %[[UB1]], %[[LB1]]
+// CHECK-DAG: %[[NEWUB1:.+]] = arith.ceildivsi %[[NEWUB1_DIFF]], %[[STEP1]]
+// CHECK: %[[NEWUB2_DIFF:.+]] = arith.subi %[[UB2]], %[[LB2]]
+// CHECK-DAG: %[[NEWUB2:.+]] = arith.ceildivsi %[[NEWUB2_DIFF]], %[[STEP2]]
+// CHECK: %[[PROD1:.+]] = arith.muli %[[NEWUB0]], %[[NEWUB1]]
+// CHECK: %[[NEWUB:.+]] = arith.muli %[[PROD1]], %[[NEWUB2]]
+// CHECK: %[[RESULT:.+]] = scf.for %[[IV:[a-zA-Z0-9]+]] = %[[C0]] to %[[NEWUB]] step %[[C1]] iter_args(%[[ITER_ARG:.+]] = %[[ARG0]])
+// CHECK: %[[IV2:.+]] = arith.remsi %[[IV]], %[[NEWUB2]]
+// CHECK: %[[PREVIOUS:.+]] = arith.divsi %[[IV]], %[[NEWUB2]]
+// CHECK: %[[IV1:.+]] = arith.remsi %[[PREVIOUS]], %[[NEWUB1]]
+// CHECK: %[[IV0:.+]] = arith.divsi %[[PREVIOUS]], %[[NEWUB1]]
+// CHECK: %[[K_STEP:.+]] = arith.muli %[[IV2]], %[[STEP2]]
+// CHECK: %[[K:.+]] = arith.addi %[[K_STEP]], %[[LB2]]
+// CHECK: %[[J_STEP:.+]] = arith.muli %[[IV1]], %[[STEP1]]
+// CHECK: %[[J:.+]] = arith.addi %[[J_STEP]], %[[LB1]]
+// CHECK: %[[I_STEP:.+]] = arith.muli %[[IV0]], %[[STEP0]]
+// CHECK: %[[I:.+]] = arith.addi %[[I_STEP]], %[[LB0]]
+// CHECK: %[[USE:.+]] = "use"(%[[ITER_ARG]], %[[I]], %[[J]], %[[K]])
+// CHECK: scf.yield %[[USE]]
+// CHECK: return %[[RESULT]]
+
+// -----
+
+// Coalesce only first two loops, but not the last since the iter_args dont line up
+func.func @tensor_loops_first_two(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %lb0 : index, %ub0 : index, %step0 : index,
+ %lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+ %0:2 = scf.for %i = %lb0 to %ub0 step %step0 iter_args(%arg2 = %arg0, %arg3 = %arg1) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+ %1:2 = scf.for %j = %lb1 to %ub1 step %step1 iter_args(%arg4 = %arg2, %arg5 = %arg3) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+ %2:2 = scf.for %k = %lb2 to %ub2 step %step2 iter_args(%arg6 = %arg5, %arg7 = %arg4) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+ %3:2 = "use"(%arg3, %i, %j, %k) : (tensor<?x?xf32>, index, index, index) -> (tensor<?x?xf32>, tensor<?x?xf32>)
+ scf.yield %3#0, %3#1 : tensor<?x?xf32>, tensor<?x?xf32>
+ }
+ scf.yield %2#0, %2#1 : tensor<?x?xf32>, tensor<?x?xf32>
+ }
+ scf.yield %1#0, %1#1 : tensor<?x?xf32>, tensor<?x?xf32>
+ } {coalesce}
+ return %0#0, %0#1 : tensor<?x?xf32>, tensor<?x?xf32>
+}
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for">
+ %2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">)
+ transform.yield
+ }
+}
+// CHECK: func.func @tensor_loops_first_two(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
+// CHECK-SAME: %[[LB0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[UB0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[STEP0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[LB1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[UB1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[STEP1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[LB2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[UB2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[STEP2:[a-zA-Z0-9_]+]]: index
+// CHECK: scf.for
+// CHECK: arith.remsi
+// CHECK: arith.divsi
+// CHECK: scf.for %{{[a-zA-Z0-9]+}} = %[[LB2]] to %[[UB2]] step %[[STEP2]]
+// CHECK-NOT: scf.for
+// CHECK: transform.named_sequence
+
+// -----
+
+// Coalesce only first two loops, but not the last since the yields dont match up
+func.func @tensor_loops_first_two_2(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %lb0 : index, %ub0 : index, %step0 : index,
+ %lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+ %0:2 = scf.for %i = %lb0 to %ub0 step %step0 iter_args(%arg2 = %arg0, %arg3 = %arg1) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+ %1:2 = scf.for %j = %lb1 to %ub1 step %step1 iter_args(%arg4 = %arg2, %arg5 = %arg3) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+ %2:2 = scf.for %k = %lb2 to %ub2 step %step2 iter_args(%arg6 = %arg4, %arg7 = %arg5) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+ %3:2 = "use"(%arg3, %i, %j, %k) : (tensor<?x?xf32>, index, index, index) -> (tensor<?x?xf32>, tensor<?x?xf32>)
+ scf.yield %3#0, %3#1 : tensor<?x?xf32>, tensor<?x?xf32>
+ }
+ scf.yield %2#1, %2#0 : tensor<?x?xf32>, tensor<?x?xf32>
+ }
+ scf.yield %1#0, %1#1 : tensor<?x?xf32>, tensor<?x?xf32>
+ } {coalesce}
+ return %0#0, %0#1 : tensor<?x?xf32>, tensor<?x?xf32>
+}
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for">
+ %2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">)
+ transform.yield
+ }
+}
+// CHECK: func.func @tensor_loops_first_two_2(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
+// CHECK-SAME: %[[LB0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[UB0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[STEP0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[LB1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[UB1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[STEP1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[LB2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[UB2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[STEP2:[a-zA-Z0-9_]+]]: index
+// CHECK: scf.for
+// CHECK: arith.remsi
+// CHECK: arith.divsi
+// CHECK: scf.for %{{[a-zA-Z0-9]+}} = %[[LB2]] to %[[UB2]] step %[[STEP2]]
+// CHECK-NOT: scf.for
+// CHECK: transform.named_sequence
+
+// -----
+
+// Coalesce only last two loops, but not the first since the yields dont match up
+func.func @tensor_loops_last_two(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %lb0 : index, %ub0 : index, %step0 : index,
+ %lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+ %0:2 = scf.for %i = %lb0 to %ub0 step %step0 iter_args(%arg2 = %arg0, %arg3 = %arg1) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+ %1:2 = scf.for %j = %lb1 to %ub1 step %step1 iter_args(%arg4 = %arg2, %arg5 = %arg3) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+ %2:2 = scf.for %k = %lb2 to %ub2 step %step2 iter_args(%arg6 = %arg4, %arg7 = %arg5) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+ %3:2 = "use"(%arg3, %i, %j, %k) : (tensor<?x?xf32>, index, index, index) -> (tensor<?x?xf32>, tensor<?x?xf32>)
+ scf.yield %3#0, %3#1 : tensor<?x?xf32>, tensor<?x?xf32>
+ }
+ scf.yield %2#0, %2#1 : tensor<?x?xf32>, tensor<?x?xf32>
+ }
+ scf.yield %1#1, %1#0 : tensor<?x?xf32>, tensor<?x?xf32>
+ } {coalesce}
+ return %0#0, %0#1 : tensor<?x?xf32>, tensor<?x?xf32>
+}
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["scf.for"]} attributes {coalesce} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.for">
+ %2 = transform.loop.coalesce %1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">)
+ transform.yield
+ }
+}
+// CHECK: func.func @tensor_loops_last_two(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
+// CHECK-SAME: %[[LB0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[UB0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[STEP0:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[LB1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[UB1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[STEP1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[LB2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[UB2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[STEP2:[a-zA-Z0-9_]+]]: index
+// CHECK: scf.for %{{[a-zA-Z0-9]+}} = %[[LB0]] to %[[UB0]] step %[[STEP0]]
+// CHECK: arith.subi
+// CHECK: arith.ceildivsi
+// CHECK: arith.subi
+// CHECK: arith.ceildivsi
+// CHECK: scf.for
+// CHECK: arith.remsi
+// CHECK: arith.divsi
+// CHECK-NOT: scf.for
+// CHECK: transform.named_sequence
+
diff --git a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-cleanup.mlir b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-cleanup.mlir
new file mode 100644
index 000000000000..9ae4c4ad392b
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-cleanup.mlir
@@ -0,0 +1,94 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+// test a parallel reduction with a cleanup region
+
+ omp.declare_reduction @add_reduction_i_32 : !llvm.ptr init {
+ ^bb0(%arg0: !llvm.ptr):
+ %0 = llvm.mlir.constant(0 : i32) : i32
+ %c4 = llvm.mlir.constant(4 : i64) : i64
+ %2 = llvm.call @malloc(%c4) : (i64) -> !llvm.ptr
+ llvm.store %0, %2 : i32, !llvm.ptr
+ omp.yield(%2 : !llvm.ptr)
+ } combiner {
+ ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ %0 = llvm.load %arg0 : !llvm.ptr -> i32
+ %1 = llvm.load %arg1 : !llvm.ptr -> i32
+ %2 = llvm.add %0, %1 : i32
+ llvm.store %2, %arg0 : i32, !llvm.ptr
+ omp.yield(%arg0 : !llvm.ptr)
+ } cleanup {
+ ^bb0(%arg0: !llvm.ptr):
+ llvm.call @free(%arg0) : (!llvm.ptr) -> ()
+ omp.yield
+ }
+
+ // CHECK-LABEL: @main
+ llvm.func @main() {
+ %0 = llvm.mlir.constant(-1 : i32) : i32
+ %1 = llvm.mlir.addressof @i : !llvm.ptr
+ %2 = llvm.mlir.addressof @j : !llvm.ptr
+ omp.parallel byref reduction(@add_reduction_i_32 %1 -> %arg0 : !llvm.ptr, @add_reduction_i_32 %2 -> %arg1 : !llvm.ptr) {
+ llvm.store %0, %arg0 : i32, !llvm.ptr
+ llvm.store %0, %arg1 : i32, !llvm.ptr
+ omp.terminator
+ }
+ llvm.return
+ }
+ llvm.mlir.global internal @i() {addr_space = 0 : i32} : i32 {
+ %0 = llvm.mlir.constant(0 : i32) : i32
+ llvm.return %0 : i32
+ }
+ llvm.mlir.global internal @j() {addr_space = 0 : i32} : i32 {
+ %0 = llvm.mlir.constant(0 : i32) : i32
+ llvm.return %0 : i32
+ }
+ llvm.func @malloc(%arg0 : i64) -> !llvm.ptr
+ llvm.func @free(%arg0 : !llvm.ptr) -> ()
+
+// CHECK: %{{.+}} =
+// Call to the outlined function.
+// CHECK: call void {{.*}} @__kmpc_fork_call
+// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Outlined function.
+// CHECK: define internal void @[[OUTLINED]]
+
+// Private reduction variable and its initialization.
+// CHECK: %tid.addr.local = alloca i32
+// CHECK: %[[MALLOC_I:.+]] = call ptr @malloc(i64 4)
+// CHECK: %[[PRIV_PTR_I:.+]] = alloca ptr
+// CHECK: store ptr %[[MALLOC_I]], ptr %[[PRIV_PTR_I]]
+// CHECK: %[[MALLOC_J:.+]] = call ptr @malloc(i64 4)
+// CHECK: %[[PRIV_PTR_J:.+]] = alloca ptr
+// CHECK: store ptr %[[MALLOC_J]], ptr %[[PRIV_PTR_J]]
+
+// Call to the reduction function.
+// CHECK: call i32 @__kmpc_reduce
+// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+
+// Non-atomic reduction:
+// CHECK: %[[PRIV_VAL_PTR_I:.+]] = load ptr, ptr %[[PRIV_PTR_I]]
+// CHECK: %[[LOAD_I:.+]] = load i32, ptr @i
+// CHECK: %[[PRIV_VAL_I:.+]] = load i32, ptr %[[PRIV_VAL_PTR_I]]
+// CHECK: %[[SUM_I:.+]] = add i32 %[[LOAD_I]], %[[PRIV_VAL_I]]
+// CHECK: store i32 %[[SUM_I]], ptr @i
+// CHECK: %[[PRIV_VAL_PTR_J:.+]] = load ptr, ptr %[[PRIV_PTR_J]]
+// CHECK: %[[LOAD_J:.+]] = load i32, ptr @j
+// CHECK: %[[PRIV_VAL_J:.+]] = load i32, ptr %[[PRIV_VAL_PTR_J]]
+// CHECK: %[[SUM_J:.+]] = add i32 %[[LOAD_J]], %[[PRIV_VAL_J]]
+// CHECK: store i32 %[[SUM_J]], ptr @j
+// CHECK: call void @__kmpc_end_reduce
+// CHECK: br label %[[FINALIZE:.+]]
+
+// CHECK: [[FINALIZE]]:
+// CHECK: br label %[[OMP_FINALIZE:.+]]
+
+// Cleanup region:
+// CHECK: [[OMP_FINALIZE]]:
+// CHECK: call void @free(ptr %[[PRIV_PTR_I]])
+// CHECK: call void @free(ptr %[[PRIV_PTR_J]])
+
+// Reduction function.
+// CHECK: define internal void @[[REDFUNC]]
+// CHECK: add i32
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir
new file mode 100644
index 000000000000..5dd31c425566
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir
@@ -0,0 +1,111 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Test that the block argument to the initialization region of
+// omp.declare_reduction gets mapped properly when translating to LLVMIR.
+
+module {
+ omp.declare_reduction @add_reduction_byref_box_Uxf64 : !llvm.ptr init {
+ ^bb0(%arg0: !llvm.ptr):
+// test usage of %arg0:
+ %11 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+ omp.yield(%arg0 : !llvm.ptr)
+ } combiner {
+ ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ omp.yield(%arg0 : !llvm.ptr)
+ }
+
+ llvm.func internal @_QFPreduce(%arg0: !llvm.ptr {fir.bindc_name = "r"}, %arg1: !llvm.ptr {fir.bindc_name = "r2"}) attributes {sym_visibility = "private"} {
+ %8 = llvm.mlir.constant(1 : i32) : i32
+ %9 = llvm.mlir.constant(10 : i32) : i32
+ %10 = llvm.mlir.constant(0 : i32) : i32
+ %83 = llvm.mlir.constant(1 : i64) : i64
+ %84 = llvm.alloca %83 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr
+ %86 = llvm.mlir.constant(1 : i64) : i64
+ %87 = llvm.alloca %86 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr
+// test multiple reduction variables to ensure they don't intefere with eachother
+// when inlining the reduction init region multiple times
+ omp.parallel byref reduction(@add_reduction_byref_box_Uxf64 %84 -> %arg3 : !llvm.ptr, @add_reduction_byref_box_Uxf64 %87 -> %arg4 : !llvm.ptr) {
+ omp.terminator
+ }
+ llvm.return
+ }
+}
+
+// CHECK-LABEL: define internal void @_QFPreduce
+// CHECK: %[[VAL_0:.*]] = alloca { ptr, ptr }, align 8
+// CHECK: %[[VAL_1:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
+// CHECK: %[[VAL_2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
+// CHECK: br label %[[VAL_3:.*]]
+// CHECK: entry: ; preds = %[[VAL_4:.*]]
+// CHECK: %[[VAL_5:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK: br label %[[VAL_6:.*]]
+// CHECK: omp_parallel: ; preds = %[[VAL_3]]
+// CHECK: %[[VAL_7:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_0]], i32 0, i32 0
+// CHECK: store ptr %[[VAL_1]], ptr %[[VAL_7]], align 8
+// CHECK: %[[VAL_8:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_0]], i32 0, i32 1
+// CHECK: store ptr %[[VAL_2]], ptr %[[VAL_8]], align 8
+// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @1, i32 1, ptr @_QFPreduce..omp_par, ptr %[[VAL_0]])
+// CHECK: br label %[[VAL_9:.*]]
+// CHECK: omp.par.outlined.exit: ; preds = %[[VAL_6]]
+// CHECK: br label %[[VAL_10:.*]]
+// CHECK: omp.par.exit.split: ; preds = %[[VAL_9]]
+// CHECK: ret void
+// CHECK: omp.par.entry:
+// CHECK: %[[VAL_11:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_12:.*]], i32 0, i32 0
+// CHECK: %[[VAL_13:.*]] = load ptr, ptr %[[VAL_11]], align 8
+// CHECK: %[[VAL_14:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_12]], i32 0, i32 1
+// CHECK: %[[VAL_15:.*]] = load ptr, ptr %[[VAL_14]], align 8
+// CHECK: %[[VAL_16:.*]] = alloca i32, align 4
+// CHECK: %[[VAL_17:.*]] = load i32, ptr %[[VAL_18:.*]], align 4
+// CHECK: store i32 %[[VAL_17]], ptr %[[VAL_16]], align 4
+// CHECK: %[[VAL_19:.*]] = load i32, ptr %[[VAL_16]], align 4
+// CHECK: %[[VAL_20:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[VAL_13]], align 8
+// CHECK: %[[VAL_21:.*]] = alloca ptr, align 8
+// CHECK: store ptr %[[VAL_13]], ptr %[[VAL_21]], align 8
+// CHECK: %[[VAL_22:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[VAL_15]], align 8
+// CHECK: %[[VAL_23:.*]] = alloca ptr, align 8
+// CHECK: store ptr %[[VAL_15]], ptr %[[VAL_23]], align 8
+// CHECK: %[[VAL_24:.*]] = alloca [2 x ptr], align 8
+// CHECK: br label %[[VAL_25:.*]]
+// CHECK: omp.par.region: ; preds = %[[VAL_26:.*]]
+// CHECK: br label %[[VAL_27:.*]]
+// CHECK: omp.par.region1: ; preds = %[[VAL_25]]
+// CHECK: br label %[[VAL_28:.*]]
+// CHECK: omp.region.cont: ; preds = %[[VAL_27]]
+// CHECK: %[[VAL_29:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_24]], i64 0, i64 0
+// CHECK: store ptr %[[VAL_21]], ptr %[[VAL_29]], align 8
+// CHECK: %[[VAL_30:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_24]], i64 0, i64 1
+// CHECK: store ptr %[[VAL_23]], ptr %[[VAL_30]], align 8
+// CHECK: %[[VAL_31:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK: %[[VAL_32:.*]] = call i32 @__kmpc_reduce(ptr @1, i32 %[[VAL_31]], i32 2, i64 16, ptr %[[VAL_24]], ptr @.omp.reduction.func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK: switch i32 %[[VAL_32]], label %[[VAL_33:.*]] [
+// CHECK: i32 1, label %[[VAL_34:.*]]
+// CHECK: i32 2, label %[[VAL_35:.*]]
+// CHECK: ]
+// CHECK: reduce.switch.atomic: ; preds = %[[VAL_28]]
+// CHECK: unreachable
+// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_28]]
+// CHECK: %[[VAL_36:.*]] = load ptr, ptr %[[VAL_21]], align 8
+// CHECK: %[[VAL_37:.*]] = load ptr, ptr %[[VAL_23]], align 8
+// CHECK: call void @__kmpc_end_reduce(ptr @1, i32 %[[VAL_31]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK: br label %[[VAL_33]]
+// CHECK: reduce.finalize: ; preds = %[[VAL_34]], %[[VAL_28]]
+// CHECK: br label %[[VAL_38:.*]]
+// CHECK: omp.par.pre_finalize: ; preds = %[[VAL_33]]
+// CHECK: br label %[[VAL_39:.*]]
+// CHECK: omp.par.outlined.exit.exitStub: ; preds = %[[VAL_38]]
+// CHECK: ret void
+// CHECK: %[[VAL_40:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_41:.*]], i64 0, i64 0
+// CHECK: %[[VAL_42:.*]] = load ptr, ptr %[[VAL_40]], align 8
+// CHECK: %[[VAL_43:.*]] = load ptr, ptr %[[VAL_42]], align 8
+// CHECK: %[[VAL_44:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_45:.*]], i64 0, i64 0
+// CHECK: %[[VAL_46:.*]] = load ptr, ptr %[[VAL_44]], align 8
+// CHECK: %[[VAL_47:.*]] = load ptr, ptr %[[VAL_46]], align 8
+// CHECK: %[[VAL_48:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_41]], i64 0, i64 1
+// CHECK: %[[VAL_49:.*]] = load ptr, ptr %[[VAL_48]], align 8
+// CHECK: %[[VAL_50:.*]] = load ptr, ptr %[[VAL_49]], align 8
+// CHECK: %[[VAL_51:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_45]], i64 0, i64 1
+// CHECK: %[[VAL_52:.*]] = load ptr, ptr %[[VAL_51]], align 8
+// CHECK: %[[VAL_53:.*]] = load ptr, ptr %[[VAL_52]], align 8
+// CHECK: ret void
+
diff --git a/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir b/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir
new file mode 100644
index 000000000000..a1e17afa53e2
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir
@@ -0,0 +1,86 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+// test a wsloop reduction with a cleanup region
+
+ omp.declare_reduction @add_reduction_i_32 : !llvm.ptr init {
+ ^bb0(%arg0: !llvm.ptr):
+ %0 = llvm.mlir.constant(0 : i32) : i32
+ %c4 = llvm.mlir.constant(4 : i64) : i64
+ %2 = llvm.call @malloc(%c4) : (i64) -> !llvm.ptr
+ llvm.store %0, %2 : i32, !llvm.ptr
+ omp.yield(%2 : !llvm.ptr)
+ } combiner {
+ ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ %0 = llvm.load %arg0 : !llvm.ptr -> i32
+ %1 = llvm.load %arg1 : !llvm.ptr -> i32
+ %2 = llvm.add %0, %1 : i32
+ llvm.store %2, %arg0 : i32, !llvm.ptr
+ omp.yield(%arg0 : !llvm.ptr)
+ } cleanup {
+ ^bb0(%arg0: !llvm.ptr):
+ llvm.call @free(%arg0) : (!llvm.ptr) -> ()
+ omp.yield
+ }
+
+ // CHECK-LABEL: @main
+ llvm.func @main() {
+ %0 = llvm.mlir.constant(-1 : i32) : i32
+ %1 = llvm.mlir.addressof @i : !llvm.ptr
+ %2 = llvm.mlir.addressof @j : !llvm.ptr
+ %loop_ub = llvm.mlir.constant(9 : i32) : i32
+ %loop_lb = llvm.mlir.constant(0 : i32) : i32
+ %loop_step = llvm.mlir.constant(1 : i32) : i32
+ omp.wsloop byref reduction(@add_reduction_i_32 %1 -> %arg0 : !llvm.ptr, @add_reduction_i_32 %2 -> %arg1 : !llvm.ptr) for (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
+ llvm.store %0, %arg0 : i32, !llvm.ptr
+ llvm.store %0, %arg1 : i32, !llvm.ptr
+ omp.terminator
+ }
+ llvm.return
+ }
+ llvm.mlir.global internal @i() {addr_space = 0 : i32} : i32 {
+ %0 = llvm.mlir.constant(0 : i32) : i32
+ llvm.return %0 : i32
+ }
+ llvm.mlir.global internal @j() {addr_space = 0 : i32} : i32 {
+ %0 = llvm.mlir.constant(0 : i32) : i32
+ llvm.return %0 : i32
+ }
+ llvm.func @malloc(%arg0 : i64) -> !llvm.ptr
+ llvm.func @free(%arg0 : !llvm.ptr) -> ()
+
+// Private reduction variable and its initialization.
+// CHECK: %[[MALLOC_I:.+]] = call ptr @malloc(i64 4)
+// CHECK: %[[PRIV_PTR_I:.+]] = alloca ptr
+// CHECK: store ptr %[[MALLOC_I]], ptr %[[PRIV_PTR_I]]
+// CHECK: %[[MALLOC_J:.+]] = call ptr @malloc(i64 4)
+// CHECK: %[[PRIV_PTR_J:.+]] = alloca ptr
+// CHECK: store ptr %[[MALLOC_J]], ptr %[[PRIV_PTR_J]]
+
+// Call to the reduction function.
+// CHECK: call i32 @__kmpc_reduce
+// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Weirdly the finalization block is generated before the reduction blocks:
+// CHECK: [[FINALIZE:.+]]:
+// CHECK: call void @__kmpc_barrier
+// CHECK: call void @free(ptr %[[PRIV_PTR_I]])
+// CHECK: call void @free(ptr %[[PRIV_PTR_J]])
+// CHECK: ret void
+
+// Non-atomic reduction:
+// CHECK: %[[PRIV_VAL_PTR_I:.+]] = load ptr, ptr %[[PRIV_PTR_I]]
+// CHECK: %[[LOAD_I:.+]] = load i32, ptr @i
+// CHECK: %[[PRIV_VAL_I:.+]] = load i32, ptr %[[PRIV_VAL_PTR_I]]
+// CHECK: %[[SUM_I:.+]] = add i32 %[[LOAD_I]], %[[PRIV_VAL_I]]
+// CHECK: store i32 %[[SUM_I]], ptr @i
+// CHECK: %[[PRIV_VAL_PTR_J:.+]] = load ptr, ptr %[[PRIV_PTR_J]]
+// CHECK: %[[LOAD_J:.+]] = load i32, ptr @j
+// CHECK: %[[PRIV_VAL_J:.+]] = load i32, ptr %[[PRIV_VAL_PTR_J]]
+// CHECK: %[[SUM_J:.+]] = add i32 %[[LOAD_J]], %[[PRIV_VAL_J]]
+// CHECK: store i32 %[[SUM_J]], ptr @j
+// CHECK: call void @__kmpc_end_reduce
+// CHECK: br label %[[FINALIZE]]
+
+// Reduction function.
+// CHECK: define internal void @[[REDFUNC]]
+// CHECK: add i32
diff --git a/mlir/test/Transforms/parallel-loop-collapsing.mlir b/mlir/test/Transforms/parallel-loop-collapsing.mlir
index 660d7edb2fbb..d1c23d584f92 100644
--- a/mlir/test/Transforms/parallel-loop-collapsing.mlir
+++ b/mlir/test/Transforms/parallel-loop-collapsing.mlir
@@ -1,6 +1,6 @@
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(test-scf-parallel-loop-collapsing{collapsed-indices-0=0,3 collapsed-indices-1=1,4 collapsed-indices-2=2}, canonicalize))' | FileCheck %s
-// CHECK-LABEL: func @parallel_many_dims() {
+// CHECK: func @parallel_many_dims() {
func.func @parallel_many_dims() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
@@ -28,19 +28,19 @@ func.func @parallel_many_dims() {
return
}
-// CHECK-DAG: [[C12:%.*]] = arith.constant 12 : index
-// CHECK-DAG: [[C10:%.*]] = arith.constant 10 : index
-// CHECK-DAG: [[C9:%.*]] = arith.constant 9 : index
-// CHECK-DAG: [[C6:%.*]] = arith.constant 6 : index
-// CHECK-DAG: [[C4:%.*]] = arith.constant 4 : index
-// CHECK-DAG: [[C3:%.*]] = arith.constant 3 : index
-// CHECK-DAG: [[C2:%.*]] = arith.constant 2 : index
-// CHECK-DAG: [[C1:%.*]] = arith.constant 1 : index
-// CHECK-DAG: [[C0:%.*]] = arith.constant 0 : index
-// CHECK: scf.parallel ([[NEW_I0:%.*]]) = ([[C0]]) to ([[C4]]) step ([[C1]]) {
-// CHECK: [[V0:%.*]] = arith.remsi [[NEW_I0]], [[C2]] : index
-// CHECK: [[I0:%.*]] = arith.divsi [[NEW_I0]], [[C2]] : index
-// CHECK: [[V2:%.*]] = arith.muli [[V0]], [[C10]] : index
-// CHECK: [[I3:%.*]] = arith.addi [[V2]], [[C9]] : index
-// CHECK: "magic.op"([[I0]], [[C3]], [[C6]], [[I3]], [[C12]]) : (index, index, index, index, index) -> index
+// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
+// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index
+// CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C9:.*]] = arith.constant 9 : index
+// CHECK-DAG: %[[C10:.*]] = arith.constant 10 : index
+// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
+// CHECK: scf.parallel (%[[NEW_I0:.*]]) = (%[[C0]]) to (%[[C4]]) step (%[[C1]]) {
+// CHECK: %[[V0:.*]] = arith.remsi %[[NEW_I0]], %[[C2]] : index
+// CHECK: %[[I0:.*]] = arith.divsi %[[NEW_I0]], %[[C2]] : index
+// CHECK: %[[V2:.*]] = arith.muli %[[V0]], %[[C10]]
+// CHECK: %[[I3:.*]] = arith.addi %[[V2]], %[[C9]]
+// CHECK: "magic.op"(%[[I0]], %[[C3]], %[[C6]], %[[I3]], %[[C12]]) : (index, index, index, index, index) -> index
// CHECK: scf.reduce
diff --git a/mlir/test/Transforms/single-parallel-loop-collapsing.mlir b/mlir/test/Transforms/single-parallel-loop-collapsing.mlir
index 542786b5fa5e..4eed61a65aa4 100644
--- a/mlir/test/Transforms/single-parallel-loop-collapsing.mlir
+++ b/mlir/test/Transforms/single-parallel-loop-collapsing.mlir
@@ -13,22 +13,22 @@ func.func @collapse_to_single() {
return
}
-// CHECK-LABEL: func @collapse_to_single() {
-// CHECK-DAG: [[C18:%.*]] = arith.constant 18 : index
-// CHECK-DAG: [[C6:%.*]] = arith.constant 6 : index
-// CHECK-DAG: [[C3:%.*]] = arith.constant 3 : index
-// CHECK-DAG: [[C7:%.*]] = arith.constant 7 : index
-// CHECK-DAG: [[C4:%.*]] = arith.constant 4 : index
-// CHECK-DAG: [[C1:%.*]] = arith.constant 1 : index
-// CHECK-DAG: [[C0:%.*]] = arith.constant 0 : index
-// CHECK: scf.parallel ([[NEW_I:%.*]]) = ([[C0]]) to ([[C18]]) step ([[C1]]) {
-// CHECK: [[I0_COUNT:%.*]] = arith.remsi [[NEW_I]], [[C6]] : index
-// CHECK: [[I1_COUNT:%.*]] = arith.divsi [[NEW_I]], [[C6]] : index
-// CHECK: [[V0:%.*]] = arith.muli [[I0_COUNT]], [[C4]] : index
-// CHECK: [[I1:%.*]] = arith.addi [[V0]], [[C7]] : index
-// CHECK: [[V1:%.*]] = arith.muli [[I1_COUNT]], [[C3]] : index
-// CHECK: [[I0:%.*]] = arith.addi [[V1]], [[C3]] : index
-// CHECK: "magic.op"([[I0]], [[I1]]) : (index, index) -> index
+// CHECK: func @collapse_to_single() {
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
+// CHECK-DAG: %[[C7:.*]] = arith.constant 7 : index
+// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
+// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index
+// CHECK-DAG: %[[C18:.*]] = arith.constant 18 : index
+// CHECK: scf.parallel (%[[NEW_I:.*]]) = (%[[C0]]) to (%[[C18]]) step (%[[C1]]) {
+// CHECK: %[[I0_COUNT:.*]] = arith.remsi %[[NEW_I]], %[[C6]] : index
+// CHECK: %[[I1_COUNT:.*]] = arith.divsi %[[NEW_I]], %[[C6]] : index
+// CHECK: %[[V0:.*]] = arith.muli %[[I0_COUNT]], %[[C4]]
+// CHECK: %[[I1:.*]] = arith.addi %[[V0]], %[[C7]]
+// CHECK: %[[V1:.*]] = arith.muli %[[I1_COUNT]], %[[C3]]
+// CHECK: %[[I0:.*]] = arith.addi %[[V1]], %[[C3]]
+// CHECK: "magic.op"(%[[I0]], %[[I1]]) : (index, index) -> index
// CHECK: scf.reduce
// CHECK-NEXT: }
// CHECK-NEXT: return
diff --git a/mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp b/mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp
index f4f1593dc53e..19011803a793 100644
--- a/mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp
@@ -22,23 +22,6 @@
using namespace mlir;
using namespace mlir::affine;
-static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
-
-static llvm::cl::opt<bool> clTestDependenceCheck(
- "test-loop-fusion-dependence-check",
- llvm::cl::desc("Enable testing of loop fusion dependence check"),
- llvm::cl::cat(clOptionsCategory));
-
-static llvm::cl::opt<bool> clTestSliceComputation(
- "test-loop-fusion-slice-computation",
- llvm::cl::desc("Enable testing of loop fusion slice computation"),
- llvm::cl::cat(clOptionsCategory));
-
-static llvm::cl::opt<bool> clTestLoopFusionTransformation(
- "test-loop-fusion-transformation",
- llvm::cl::desc("Enable testing of loop fusion transformation"),
- llvm::cl::cat(clOptionsCategory));
-
namespace {
struct TestLoopFusion
@@ -50,6 +33,24 @@ struct TestLoopFusion
return "Tests loop fusion utility functions.";
}
void runOnOperation() override;
+
+ TestLoopFusion() = default;
+ TestLoopFusion(const TestLoopFusion &pass) : PassWrapper(pass){};
+
+ Option<bool> clTestDependenceCheck{
+ *this, "test-loop-fusion-dependence-check",
+ llvm::cl::desc("Enable testing of loop fusion dependence check"),
+ llvm::cl::init(false)};
+
+ Option<bool> clTestSliceComputation{
+ *this, "test-loop-fusion-slice-computation",
+ llvm::cl::desc("Enable testing of loop fusion slice computation"),
+ llvm::cl::init(false)};
+
+ Option<bool> clTestLoopFusionTransformation{
+ *this, "test-loop-fusion-transformation",
+ llvm::cl::desc("Enable testing of loop fusion transformation"),
+ llvm::cl::init(false)};
};
} // namespace
diff --git a/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp b/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp
index 5e160b720db6..4b2b1a06341b 100644
--- a/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp
@@ -117,14 +117,17 @@ static LogicalResult testReifyValueBounds(func::FuncOp funcOp,
// Prepare stop condition. By default, reify in terms of the op's
// operands. No stop condition is used when a constant was requested.
- std::function<bool(Value, std::optional<int64_t>)> stopCondition =
- [&](Value v, std::optional<int64_t> d) {
+ std::function<bool(Value, std::optional<int64_t>,
+ ValueBoundsConstraintSet & cstr)>
+ stopCondition = [&](Value v, std::optional<int64_t> d,
+ ValueBoundsConstraintSet &cstr) {
// Reify in terms of SSA values that are different from `value`.
return v != value;
};
if (reifyToFuncArgs) {
// Reify in terms of function block arguments.
- stopCondition = stopCondition = [](Value v, std::optional<int64_t> d) {
+ stopCondition = [](Value v, std::optional<int64_t> d,
+ ValueBoundsConstraintSet &cstr) {
auto bbArg = dyn_cast<BlockArgument>(v);
if (!bbArg)
return false;
diff --git a/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp
index b497f8d75fde..598678f64cb4 100644
--- a/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp
@@ -37,39 +37,6 @@ using namespace mlir::affine;
static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
-static llvm::cl::list<int> clTestVectorShapeRatio(
- "vector-shape-ratio",
- llvm::cl::desc("Specify the HW vector size for vectorization"),
- llvm::cl::cat(clOptionsCategory));
-static llvm::cl::opt<bool> clTestForwardSlicingAnalysis(
- "forward-slicing",
- llvm::cl::desc("Enable testing forward static slicing and topological sort "
- "functionalities"),
- llvm::cl::cat(clOptionsCategory));
-static llvm::cl::opt<bool> clTestBackwardSlicingAnalysis(
- "backward-slicing",
- llvm::cl::desc("Enable testing backward static slicing and "
- "topological sort functionalities"),
- llvm::cl::cat(clOptionsCategory));
-static llvm::cl::opt<bool> clTestSlicingAnalysis(
- "slicing",
- llvm::cl::desc("Enable testing static slicing and topological sort "
- "functionalities"),
- llvm::cl::cat(clOptionsCategory));
-static llvm::cl::opt<bool> clTestComposeMaps(
- "compose-maps",
- llvm::cl::desc(
- "Enable testing the composition of AffineMap where each "
- "AffineMap in the composition is specified as the affine_map attribute "
- "in a constant op."),
- llvm::cl::cat(clOptionsCategory));
-static llvm::cl::opt<bool> clTestVecAffineLoopNest(
- "vectorize-affine-loop-nest",
- llvm::cl::desc(
- "Enable testing for the 'vectorizeAffineLoopNest' utility by "
- "vectorizing the outermost loops found"),
- llvm::cl::cat(clOptionsCategory));
-
namespace {
struct VectorizerTestPass
: public PassWrapper<VectorizerTestPass, OperationPass<func::FuncOp>> {
@@ -85,6 +52,37 @@ struct VectorizerTestPass
return "Tests vectorizer standalone functionality.";
}
+ VectorizerTestPass() = default;
+ VectorizerTestPass(const VectorizerTestPass &pass) : PassWrapper(pass){};
+
+ ListOption<int> clTestVectorShapeRatio{
+ *this, "vector-shape-ratio",
+ llvm::cl::desc("Specify the HW vector size for vectorization")};
+ Option<bool> clTestForwardSlicingAnalysis{
+ *this, "forward-slicing",
+ llvm::cl::desc(
+ "Enable testing forward static slicing and topological sort "
+ "functionalities")};
+ Option<bool> clTestBackwardSlicingAnalysis{
+ *this, "backward-slicing",
+ llvm::cl::desc("Enable testing backward static slicing and "
+ "topological sort functionalities")};
+ Option<bool> clTestSlicingAnalysis{
+ *this, "slicing",
+ llvm::cl::desc("Enable testing static slicing and topological sort "
+ "functionalities")};
+ Option<bool> clTestComposeMaps{
+ *this, "compose-maps",
+ llvm::cl::desc("Enable testing the composition of AffineMap where each "
+ "AffineMap in the composition is specified as the "
+ "affine_map attribute "
+ "in a constant op.")};
+ Option<bool> clTestVecAffineLoopNest{
+ *this, "vectorize-affine-loop-nest",
+ llvm::cl::desc(
+ "Enable testing for the 'vectorizeAffineLoopNest' utility by "
+ "vectorizing the outermost loops found")};
+
void runOnOperation() override;
void testVectorShapeRatio(llvm::raw_ostream &outs);
void testForwardSlicing(llvm::raw_ostream &outs);
diff --git a/mlir/test/mlir-tblgen/op-properties.td b/mlir/test/mlir-tblgen/op-properties.td
new file mode 100644
index 000000000000..a484f68fc4a1
--- /dev/null
+++ b/mlir/test/mlir-tblgen/op-properties.td
@@ -0,0 +1,21 @@
+// RUN: mlir-tblgen -gen-op-defs -I %S/../../include %s | FileCheck %s
+
+include "mlir/IR/AttrTypeBase.td"
+include "mlir/IR/EnumAttr.td"
+include "mlir/IR/OpBase.td"
+
+def Test_Dialect : Dialect {
+ let name = "test";
+ let cppNamespace = "foobar";
+}
+class NS_Op<string mnemonic, list<Trait> traits = []> :
+ Op<Test_Dialect, mnemonic, traits>;
+
+def OpWithAttr : NS_Op<"op_with_attr">{
+ let arguments = (ins AnyAttr:$attr, OptionalAttr<AnyAttr>:$optional);
+}
+
+// CHECK: void OpWithAttr::setAttrAttr(::mlir::Attribute attr)
+// CHECK-NEXT: getProperties().attr = attr
+// CHECK: void OpWithAttr::setOptionalAttr(::mlir::Attribute attr)
+// CHECK-NEXT: getProperties().optional = attr
diff --git a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
index f14e559fff92..fe6ad1504112 100644
--- a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
+++ b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
@@ -1008,7 +1008,7 @@ void {0}::regionBuilder(ImplicitLocOpBuilder &b,
Block &block, ArrayRef<NamedAttribute> attrs) {{
assert({1} > 0 && block.getNumArguments() == {1} &&
"{0} regionBuilder expects {1} (>=0) args");
- RegionBuilderHelper helper(block.getArgument(0).getContext(), block);
+ RegionBuilderHelper helper(b, block);
SmallVector<Value> yields;
{2}
{3}
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index 3a697520dfad..843760d57c99 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -1804,23 +1804,36 @@ void OpEmitter::genAttrGetters() {
}
void OpEmitter::genAttrSetters() {
+ bool useProperties = op.getDialect().usePropertiesForAttributes();
+
+ // Generate the code to set an attribute.
+ auto emitSetAttr = [&](Method *method, StringRef getterName,
+ StringRef attrName, StringRef attrVar) {
+ if (useProperties) {
+ method->body() << formatv(" getProperties().{0} = {1};", attrName,
+ attrVar);
+ } else {
+ method->body() << formatv(" (*this)->setAttr({0}AttrName(), {1});",
+ getterName, attrVar);
+ }
+ };
+
// Generate raw named setter type. This is a wrapper class that allows setting
// to the attributes via setters instead of having to use the string interface
// for better compile time verification.
auto emitAttrWithStorageType = [&](StringRef setterName, StringRef getterName,
- Attribute attr) {
+ StringRef attrName, Attribute attr) {
auto *method =
opClass.addMethod("void", setterName + "Attr",
MethodParameter(attr.getStorageType(), "attr"));
if (method)
- method->body() << formatv(" (*this)->setAttr({0}AttrName(), attr);",
- getterName);
+ emitSetAttr(method, getterName, attrName, "attr");
};
// Generate a setter that accepts the underlying C++ type as opposed to the
// attribute type.
auto emitAttrWithReturnType = [&](StringRef setterName, StringRef getterName,
- Attribute attr) {
+ StringRef attrName, Attribute attr) {
Attribute baseAttr = attr.getBaseAttr();
if (!canUseUnwrappedRawValue(baseAttr))
return;
@@ -1849,9 +1862,8 @@ void OpEmitter::genAttrSetters() {
// If the value isn't optional, just set it directly.
if (!isOptional) {
- method->body() << formatv(
- " (*this)->setAttr({0}AttrName(), {1});", getterName,
- constBuildAttrFromParam(attr, fctx, "attrValue"));
+ emitSetAttr(method, getterName, attrName,
+ constBuildAttrFromParam(attr, fctx, "attrValue"));
return;
}
@@ -1862,13 +1874,25 @@ void OpEmitter::genAttrSetters() {
// optional but not in the same way as the others (i.e. it uses bool over
// std::optional<>).
StringRef paramStr = isUnitAttr ? "attrValue" : "*attrValue";
- const char *optionalCodeBody = R"(
+ if (!useProperties) {
+ const char *optionalCodeBody = R"(
if (attrValue)
return (*this)->setAttr({0}AttrName(), {1});
(*this)->removeAttr({0}AttrName());)";
- method->body() << formatv(
- optionalCodeBody, getterName,
- constBuildAttrFromParam(baseAttr, fctx, paramStr));
+ method->body() << formatv(
+ optionalCodeBody, getterName,
+ constBuildAttrFromParam(baseAttr, fctx, paramStr));
+ } else {
+ const char *optionalCodeBody = R"(
+ auto &odsProp = getProperties().{0};
+ if (attrValue)
+ odsProp = {1};
+ else
+ odsProp = nullptr;)";
+ method->body() << formatv(
+ optionalCodeBody, attrName,
+ constBuildAttrFromParam(baseAttr, fctx, paramStr));
+ }
};
for (const NamedAttribute &namedAttr : op.getAttributes()) {
@@ -1876,8 +1900,10 @@ void OpEmitter::genAttrSetters() {
continue;
std::string setterName = op.getSetterName(namedAttr.name);
std::string getterName = op.getGetterName(namedAttr.name);
- emitAttrWithStorageType(setterName, getterName, namedAttr.attr);
- emitAttrWithReturnType(setterName, getterName, namedAttr.attr);
+ emitAttrWithStorageType(setterName, getterName, namedAttr.name,
+ namedAttr.attr);
+ emitAttrWithReturnType(setterName, getterName, namedAttr.name,
+ namedAttr.attr);
}
}
diff --git a/openmp/runtime/test/affinity/kmp-abs-hw-subset.c b/openmp/runtime/test/affinity/kmp-abs-hw-subset.c
index 7b3493f1e5c4..025a239a1b83 100644
--- a/openmp/runtime/test/affinity/kmp-abs-hw-subset.c
+++ b/openmp/runtime/test/affinity/kmp-abs-hw-subset.c
@@ -6,6 +6,12 @@
// RUN: env OMP_PLACES=threads %libomp-run 3 1
// RUN: env OMP_PLACES=threads %libomp-run 3 2
// REQUIRES: linux
+//
+// The test requires topologies with sockets, cores, threads layers where
+// the socket layer contains multiple threads.
+// The s390x architecture does not produce this topology and seems to have
+// one thread per socket.
+// UNSUPPORTED: s390x-target-arch
#include <stdio.h>
#include <stdlib.h>
diff --git a/utils/bazel/.bazelrc b/utils/bazel/.bazelrc
index 1d7cf4a4df1b..5a6d1889076a 100644
--- a/utils/bazel/.bazelrc
+++ b/utils/bazel/.bazelrc
@@ -9,10 +9,16 @@
# Prevent invalid caching if input files are modified during a build.
build --experimental_guard_against_concurrent_changes
+# Automatically enable --config=(linux|macos|windows) based on the host
+build --enable_platform_specific_config
+
# In opt mode, bazel by default builds both PIC and non-PIC object files for
# tests vs binaries. We don't need this feature and it slows down opt builds
# considerably.
-build --force_pic
+# TODO: Remove platform specifics we're on bazel 7.x https://github.com/bazelbuild/bazel/issues/12439
+# Apple platforms always enable pic so this flag is unnecessary anyways
+build:linux --force_pic
+build:windows --force_pic
# Shared objects take up more space. With fast linkers and binaries that aren't
# super large, the benefits of shared objects are minimal.
@@ -34,6 +40,9 @@ build --incompatible_no_implicit_file_export
# eventually become the default
common --incompatible_disallow_empty_glob
+# TODO: Remove once we move to bazel 7.x
+build --experimental_cc_shared_library
+
###############################################################################
# Options to select different strategies for linking potential dependent
# libraries. The default leaves it disabled.
diff --git a/utils/bazel/configure.bzl b/utils/bazel/configure.bzl
index d6cd6aa0813e..717b86d7d6e8 100644
--- a/utils/bazel/configure.bzl
+++ b/utils/bazel/configure.bzl
@@ -4,8 +4,6 @@
"""Helper macros to configure the LLVM overlay project."""
-load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe")
-
# Directory of overlay files relative to WORKSPACE
DEFAULT_OVERLAY_PATH = "llvm-project-overlay"
@@ -77,6 +75,7 @@ def _extract_cmake_settings(repository_ctx, llvm_cmake):
"LLVM_VERSION_MAJOR": None,
"LLVM_VERSION_MINOR": None,
"LLVM_VERSION_PATCH": None,
+ "LLVM_VERSION_SUFFIX": None,
}
# It would be easier to use external commands like sed(1) and python.
@@ -126,6 +125,13 @@ def _extract_cmake_settings(repository_ctx, llvm_cmake):
c["LLVM_VERSION_PATCH"],
)
+ c["PACKAGE_VERSION"] = "{}.{}.{}{}".format(
+ c["LLVM_VERSION_MAJOR"],
+ c["LLVM_VERSION_MINOR"],
+ c["LLVM_VERSION_PATCH"],
+ c["LLVM_VERSION_SUFFIX"],
+ )
+
return c
def _write_dict_to_file(repository_ctx, filepath, header, vars):
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 9dfe4c48184e..b8cd290a41a2 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -68,7 +68,6 @@ libc_support_library(
name = "llvm_libc_macros_math_macros",
hdrs = ["include/llvm-libc-macros/math-macros.h"],
deps = [":llvm_libc_macros_limits_macros"],
- defines = ["__FP_LOGBNAN_MIN"],
)
libc_support_library(
@@ -1000,8 +999,8 @@ libc_support_library(
libc_support_library(
name = "__support_osutil_quick_exit",
- hdrs = ["src/__support/OSUtil/quick_exit.h"],
srcs = ["src/__support/OSUtil/linux/quick_exit.cpp"],
+ hdrs = ["src/__support/OSUtil/quick_exit.h"],
deps = [
":__support_osutil_syscall",
],
@@ -1696,16 +1695,14 @@ libc_math_function(
)
libc_math_function(name = "fabs")
-
libc_math_function(name = "fabsf")
-
libc_math_function(name = "fabsl")
+libc_math_function(name = "fabsf128")
libc_math_function(name = "fdim")
-
libc_math_function(name = "fdimf")
-
libc_math_function(name = "fdiml")
+libc_math_function(name = "fdimf128")
libc_math_function(
name = "ceil",
@@ -1730,6 +1727,9 @@ libc_math_function(
],
)
+libc_math_function(name = "ceilf128")
+
+
libc_math_function(
name = "floor",
specializations = [
@@ -1747,12 +1747,12 @@ libc_math_function(
)
libc_math_function(name = "floorl")
+libc_math_function(name = "floorf128")
libc_math_function(name = "ldexp")
-
libc_math_function(name = "ldexpf")
-
libc_math_function(name = "ldexpl")
+libc_math_function(name = "ldexpf128")
libc_math_function(
name = "trunc",
@@ -1771,6 +1771,7 @@ libc_math_function(
)
libc_math_function(name = "truncl")
+libc_math_function(name = "truncf128")
libc_math_function(
name = "round",
@@ -1789,6 +1790,7 @@ libc_math_function(
)
libc_math_function(name = "roundl")
+libc_math_function(name = "roundf128")
libc_math_function(
name = "fmod",
@@ -1805,10 +1807,9 @@ libc_math_function(
)
libc_math_function(name = "frexp")
-
libc_math_function(name = "frexpf")
-
libc_math_function(name = "frexpl")
+libc_math_function(name = "frexpf128")
libc_math_function(name = "hypot")
@@ -1820,40 +1821,32 @@ libc_math_function(
)
libc_math_function(name = "logb")
-
libc_math_function(name = "logbf")
-
libc_math_function(name = "logbl")
+libc_math_function(name = "logbf128")
libc_math_function(name = "modf")
-
libc_math_function(name = "modff")
-
libc_math_function(name = "modfl")
+libc_math_function(name = "modff128")
libc_math_function(name = "remquo")
-
libc_math_function(name = "remquof")
-
libc_math_function(name = "remquol")
libc_math_function(name = "remainder")
-
libc_math_function(name = "remainderf")
-
libc_math_function(name = "remainderl")
libc_math_function(name = "fmin")
-
libc_math_function(name = "fminf")
-
libc_math_function(name = "fminl")
+libc_math_function(name = "fminf128")
libc_math_function(name = "fmax")
-
libc_math_function(name = "fmaxf")
-
libc_math_function(name = "fmaxl")
+libc_math_function(name = "fmaxf128")
libc_math_function(
name = "cosf",
@@ -1927,49 +1920,47 @@ libc_math_function(
],
)
-libc_math_function(name = "copysign")
+libc_math_function(
+ name = "sqrtf128",
+ additional_deps = [
+ ":__support_fputil_sqrt",
+ ],
+)
+libc_math_function(name = "copysign")
libc_math_function(name = "copysignf")
-
libc_math_function(name = "copysignl")
-
libc_math_function(name = "copysignf128")
libc_math_function(name = "ilogb")
-
libc_math_function(name = "ilogbf")
-
libc_math_function(name = "ilogbl")
+libc_math_function(name = "ilogbf128")
libc_math_function(name = "rint")
-
libc_math_function(name = "rintf")
-
libc_math_function(name = "rintl")
+libc_math_function(name = "rintf128")
libc_math_function(name = "lrint")
-
libc_math_function(name = "lrintf")
-
libc_math_function(name = "lrintl")
+libc_math_function(name = "lrintf128")
libc_math_function(name = "llrint")
-
libc_math_function(name = "llrintf")
-
libc_math_function(name = "llrintl")
+libc_math_function(name = "llrintf128")
libc_math_function(name = "lround")
-
libc_math_function(name = "lroundf")
-
libc_math_function(name = "lroundl")
+libc_math_function(name = "lroundf128")
libc_math_function(name = "llround")
-
libc_math_function(name = "llroundf")
-
libc_math_function(name = "llroundl")
+libc_math_function(name = "llroundf128")
libc_math_function(
name = "nan",
@@ -1995,28 +1986,29 @@ libc_math_function(
],
)
-libc_math_function(name = "nearbyint")
+libc_math_function(
+ name = "nanf128",
+ additional_deps = [
+ ":__support_str_to_float",
+ ":errno",
+ ],
+)
+libc_math_function(name = "nearbyint")
libc_math_function(name = "nearbyintf")
-
libc_math_function(name = "nearbyintl")
libc_math_function(name = "nextafter")
-
libc_math_function(name = "nextafterf")
-
libc_math_function(name = "nextafterl")
+libc_math_function(name = "nextafterf128")
libc_math_function(name = "nexttoward")
-
libc_math_function(name = "nexttowardf")
-
libc_math_function(name = "nexttowardl")
libc_math_function(name = "scalbn")
-
libc_math_function(name = "scalbnf")
-
libc_math_function(name = "scalbnl")
############################## inttypes targets ##############################
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel
index 4f976122967c..c0d402a89ea3 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel
@@ -87,6 +87,7 @@ libc_test(
srcs = ["uint_test.cpp"],
deps = [
"//libc:__support_cpp_optional",
+ "//libc:__support_integer_literals",
"//libc:__support_macros_properties_types",
"//libc:__support_uint",
"//libc:llvm_libc_macros_math_macros",
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel
new file mode 100644
index 000000000000..0d69a480cb28
--- /dev/null
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel
@@ -0,0 +1,147 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Smoke tests for LLVM libc math.h functions.
+
+load("//libc:libc_build_rules.bzl", "libc_support_library")
+load("//libc/test/src/math:libc_math_test_rules.bzl", "math_test")
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])
+
+math_test(
+ name = "fabsf128",
+ hdrs = ["FAbsTest.h"],
+)
+
+math_test(
+ name = "ceilf128",
+ hdrs = ["CeilTest.h"],
+)
+
+math_test(
+ name = "floorf128",
+ hdrs = ["FloorTest.h"],
+)
+
+math_test(
+ name = "truncf128",
+ hdrs = ["TruncTest.h"],
+)
+
+math_test(
+ name = "roundf128",
+ hdrs = ["RoundTest.h"],
+)
+
+math_test(
+ name = "frexpf128",
+ hdrs = ["FrexpTest.h"],
+)
+
+math_test(
+ name = "logbf128",
+ hdrs = ["LogbTest.h"],
+)
+
+math_test(
+ name = "modff128",
+ hdrs = ["ModfTest.h"],
+)
+
+math_test(
+ name = "fminf128",
+ hdrs = ["FMinTest.h"],
+)
+
+math_test(
+ name = "fmaxf128",
+ hdrs = ["FMaxTest.h"],
+)
+
+math_test(
+ name = "sqrtf128",
+ hdrs = ["SqrtTest.h"],
+ deps = ["//libc:__support_cpp_bit"],
+)
+
+math_test(
+ name = "copysignf128",
+ hdrs = ["CopySignTest.h"],
+)
+
+math_test(
+ name = "ilogbf128",
+ hdrs = ["ILogbTest.h"],
+ deps = ["//libc:__support_cpp_limits"],
+)
+
+math_test(
+ name = "fdimf128",
+ hdrs = ["FDimTest.h"],
+)
+
+libc_support_library(
+ name = "ldexp_test_template",
+ hdrs = ["LdExpTest.h"],
+ deps = [
+ "//libc:__support_cpp_limits",
+ "//libc:__support_fputil_fp_bits",
+ "//libc:__support_fputil_normal_float",
+ "//libc:llvm_libc_macros_math_macros",
+ "//libc/test/UnitTest:LibcUnitTest",
+ "//libc/test/UnitTest:fp_test_helpers",
+ ],
+)
+
+math_test(
+ name = "ldexpf128",
+ hdrs = ["LdExpTest.h"],
+ deps = ["//libc:__support_cpp_limits"],
+)
+
+math_test(
+ name = "rintf128",
+ hdrs = ["RIntTest.h"],
+)
+
+math_test(
+ name = "lrintf128",
+ hdrs = ["RoundToIntegerTest.h"],
+)
+
+math_test(
+ name = "llrintf128",
+ hdrs = ["RoundToIntegerTest.h"],
+)
+math_test(
+ name = "lroundf128",
+ hdrs = ["RoundToIntegerTest.h"],
+)
+
+math_test(
+ name = "llroundf128",
+ hdrs = ["RoundToIntegerTest.h"],
+)
+
+libc_support_library(
+ name = "nextafter_test_template",
+ hdrs = ["NextAfterTest.h"],
+ deps = [
+ "//libc:__support_cpp_array",
+ "//libc:__support_cpp_bit",
+ "//libc:__support_cpp_type_traits",
+ "//libc:__support_fputil_basic_operations",
+ "//libc:__support_fputil_fp_bits",
+ "//libc:llvm_libc_macros_math_macros",
+ "//libc/test/UnitTest:LibcUnitTest",
+ "//libc/test/UnitTest:fp_test_helpers",
+ ],
+)
+
+math_test(
+ name = "nextafterf128",
+ deps = [":nextafter_test_template"],
+)
diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
new file mode 100644
index 000000000000..300f2798f7aa
--- /dev/null
+++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
@@ -0,0 +1,848 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("@bazel_skylib//lib:selects.bzl", "selects")
+load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+load("//:vars.bzl", "LLVM_VERSION_MAJOR", "LLVM_VERSION_MINOR", "LLVM_VERSION_PATCH", "LLVM_VERSION_SUFFIX", "PACKAGE_VERSION")
+load("//lldb/source/Plugins:plugin_config.bzl", "DEFAULT_PLUGINS", "DEFAULT_SCRIPT_PLUGINS", "OBJCPP_COPTS")
+load("//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
+
+package(
+ default_visibility = ["//visibility:public"],
+ features = ["layering_check"],
+)
+
+licenses(["notice"])
+
+exports_files(["LICENSE.TXT"])
+
+bool_flag(
+ name = "enable_curses",
+ build_setting_default = False,
+)
+
+config_setting(
+ name = "curses_enabled_setting",
+ flag_values = {":enable_curses": "true"},
+)
+
+selects.config_setting_group(
+ name = "curses_enabled",
+ match_any = [
+ ":curses_enabled_setting",
+ "@platforms//os:macos",
+ ],
+)
+
+bool_flag(
+ name = "enable_libedit",
+ build_setting_default = False,
+)
+
+config_setting(
+ name = "libedit_enabled_setting",
+ flag_values = {":enable_libedit": "true"},
+)
+
+selects.config_setting_group(
+ name = "libedit_enabled",
+ match_any = [
+ ":libedit_enabled_setting",
+ "@platforms//os:macos",
+ ],
+)
+
+_VERSION_SUBSTITUTIONS = {
+ "@LLDB_VERSION@": PACKAGE_VERSION,
+ "@LLDB_VERSION_MAJOR@": LLVM_VERSION_MAJOR,
+ "@LLDB_VERSION_MINOR@": LLVM_VERSION_MINOR,
+ "@LLDB_VERSION_PATCH@": LLVM_VERSION_PATCH,
+ "@LLDB_VERSION_SUFFIX@": LLVM_VERSION_SUFFIX,
+ '#cmakedefine LLDB_FULL_VERSION_STRING "@LLDB_FULL_VERSION_STRING@"': "/* #undef LLDB_FULL_VERSION_STRING */",
+}
+
+genrule(
+ name = "vcs_version_gen",
+ outs = ["VCSVersion.inc"],
+ cmd = "echo '#undef LLDB_REVISION' >> $@\n" +
+ "echo '#undef LLDB_REPOSITORY' >> $@\n",
+)
+
+expand_template(
+ name = "version_inc_gen",
+ out = "Version/Version.inc",
+ substitutions = _VERSION_SUBSTITUTIONS,
+ template = "include/lldb/Version/Version.inc.in",
+)
+
+cc_library(
+ name = "Version",
+ srcs = [
+ "source/Version/Version.cpp",
+ ":vcs_version_gen",
+ ":version_inc_gen",
+ ],
+ hdrs = ["include/lldb/Version/Version.h"],
+ features = ["-layering_check"], # Version.inc breaks this unintentionally
+ strip_include_prefix = "include",
+ deps = ["//clang:basic"],
+)
+
+expand_template(
+ name = "ConfigHeader",
+ out = "include/lldb/Host/Config.h",
+ substitutions = {
+ "#cmakedefine01 HAVE_PTSNAME_R": "#define HAVE_PTSNAME_R 1",
+ "#cmakedefine01 LLDB_ENABLE_TERMIOS": "#define LLDB_ENABLE_TERMIOS 1",
+
+ # TODO: Add LZMA support by including the library in bazel
+ "#cmakedefine01 LLDB_ENABLE_LZMA": "#define LLDB_ENABLE_LZMA 0",
+
+ # TODO: lua support
+ "#cmakedefine01 LLDB_ENABLE_LUA": "#define LLDB_ENABLE_LUA 0",
+
+ # TODO: python support
+ "#cmakedefine01 LLDB_ENABLE_PYTHON": "#define LLDB_ENABLE_PYTHON 0",
+ # Only enabled by default on Windows
+ "#cmakedefine01 LLDB_EMBED_PYTHON_HOME": "#define LLDB_EMBED_PYTHON_HOME 0",
+ # Only used if LLDB_EMBED_PYTHON_HOME is true
+ "#cmakedefine LLDB_PYTHON_HOME R\"(${LLDB_PYTHON_HOME})\"": "#define LLDB_PYTHON_HOME \"\"",
+
+ # Unsupported
+ "#cmakedefine01 CURSES_HAVE_NCURSES_CURSES_H": "#define CURSES_HAVE_NCURSES_CURSES_H 0",
+ "#cmakedefine01 LLDB_ENABLE_FBSDVMCORE": "#define LLDB_ENABLE_FBSDVMCORE 0",
+
+ # Defaults that could be configurable if needed
+ "#cmakedefine01 LLDB_ENABLE_POSIX": "#define LLDB_ENABLE_POSIX 1",
+ "#cmakedefine LLDB_GLOBAL_INIT_DIRECTORY R\"(${LLDB_GLOBAL_INIT_DIRECTORY})\"": "#define LLDB_GLOBAL_INIT_DIRECTORY \"\"",
+ "${LLDB_INSTALL_LIBDIR_BASENAME}": "lib",
+ "${LLDB_BUG_REPORT_URL}": "",
+ } | select({
+ "@platforms//os:macos": {
+ "#cmakedefine HAVE_LIBCOMPRESSION": "#define HAVE_LIBCOMPRESSION",
+ "#cmakedefine01 HAVE_NR_PROCESS_VM_READV": "#define HAVE_NR_PROCESS_VM_READV 0",
+ "#cmakedefine01 HAVE_PPOLL": "#define HAVE_PPOLL 0",
+ "#cmakedefine01 HAVE_PROCESS_VM_READV": "#define HAVE_PROCESS_VM_READV 0",
+ "#cmakedefine01 HAVE_SYS_EVENT_H": "#define HAVE_SYS_EVENT_H 1",
+ "#cmakedefine01 LLDB_ENABLE_LIBXML2": "#define LLDB_ENABLE_LIBXML2 1",
+ "#cmakedefine01 LLDB_HAVE_EL_RFUNC_T": "#define LLDB_HAVE_EL_RFUNC_T 0",
+ },
+ "@platforms//os:linux": {
+ "#cmakedefine HAVE_LIBCOMPRESSION": "/* #undef HAVE_LIBCOMPRESSION */",
+ "#cmakedefine01 HAVE_NR_PROCESS_VM_READV": "#define HAVE_NR_PROCESS_VM_READV 1",
+ "#cmakedefine01 HAVE_PPOLL": "#define HAVE_PPOLL 1",
+ "#cmakedefine01 HAVE_PROCESS_VM_READV": "#define HAVE_PROCESS_VM_READV 1",
+ "#cmakedefine01 HAVE_SYS_EVENT_H": "#define HAVE_SYS_EVENT_H 0",
+ "#cmakedefine01 LLDB_ENABLE_LIBXML2": "#define LLDB_ENABLE_LIBXML2 0",
+ "#cmakedefine01 LLDB_HAVE_EL_RFUNC_T": "#define LLDB_HAVE_EL_RFUNC_T 1",
+ },
+ }) | select({
+ ":curses_enabled": {
+ "#cmakedefine01 LLDB_ENABLE_CURSES": "#define LLDB_ENABLE_CURSES 1",
+ },
+ "//conditions:default": {
+ "#cmakedefine01 LLDB_ENABLE_CURSES": "#define LLDB_ENABLE_CURSES 0",
+ },
+ }) | select({
+ ":libedit_enabled": {
+ "#cmakedefine01 LLDB_EDITLINE_USE_WCHAR": "#define LLDB_EDITLINE_USE_WCHAR 1",
+ "#cmakedefine01 LLDB_ENABLE_LIBEDIT": "#define LLDB_ENABLE_LIBEDIT 1",
+ },
+ "//conditions:default": {
+ "#cmakedefine01 LLDB_EDITLINE_USE_WCHAR": "#define LLDB_EDITLINE_USE_WCHAR 0",
+ "#cmakedefine01 LLDB_ENABLE_LIBEDIT": "#define LLDB_ENABLE_LIBEDIT 0",
+ },
+ }),
+ template = "include/lldb/Host/Config.h.cmake",
+)
+
+cc_library(
+ name = "Config",
+ hdrs = [":ConfigHeader"],
+ include_prefix = "lldb/Host",
+)
+
+cc_binary(
+ name = "lldb-tblgen",
+ srcs = glob([
+ "utils/TableGen/*.cpp",
+ "utils/TableGen/*.h",
+ ]),
+ deps = [
+ "//llvm:CodeGenTypes",
+ "//llvm:Support",
+ "//llvm:TableGen",
+ "//llvm:TargetParser",
+ "//llvm:config",
+ ],
+)
+
+cc_library(
+ name = "API",
+ srcs = glob([
+ "source/API/**/*.cpp",
+ "source/API/**/*.h",
+ ]),
+ hdrs = glob(["include/lldb/API/**/*.h"]),
+ strip_include_prefix = "include",
+ deps = [
+ ":Breakpoint",
+ ":Commands",
+ ":Core",
+ ":DataFormatters",
+ ":Expression",
+ ":Headers",
+ ":Host",
+ ":Initialization",
+ ":InterpreterHeaders",
+ ":Symbol",
+ ":SymbolHeaders",
+ ":Target",
+ ":TargetHeaders",
+ ":Utility",
+ ":Version",
+ "//lldb/source/Plugins:PluginExpressionParserClangHeaders",
+ "//lldb/source/Plugins:PluginsConfig",
+ "//llvm:ExecutionEngine",
+ "//llvm:MCJIT",
+ "//llvm:Support",
+ "//llvm:config",
+ ],
+)
+
+cc_library(
+ name = "Breakpoint",
+ srcs = glob(["source/Breakpoint/**/*.cpp"]),
+ hdrs = glob(["include/lldb/Breakpoint/**/*.h"]),
+ strip_include_prefix = "include",
+ deps = [
+ ":Core",
+ ":DataFormattersHeaders",
+ ":Expression",
+ ":Headers",
+ ":InterpreterHeaders",
+ ":SymbolHeaders",
+ ":TargetHeaders",
+ ":Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "DataFormatters",
+ srcs = glob(["source/DataFormatters/**/*.cpp"]),
+ hdrs = glob(["include/lldb/DataFormatters/**/*.h"]),
+ strip_include_prefix = "include",
+ deps = [
+ ":CoreHeaders",
+ ":Headers",
+ ":InterpreterHeaders",
+ ":SymbolHeaders",
+ ":TargetHeaders",
+ ":Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "Expression",
+ srcs = glob(["source/Expression/**/*.cpp"]),
+ hdrs = glob(["include/lldb/Expression/**/*.h"]),
+ strip_include_prefix = "include",
+ deps = [
+ ":Core",
+ ":Headers",
+ ":Host",
+ ":InterpreterHeaders",
+ ":SymbolHeaders",
+ ":TargetHeaders",
+ ":Utility",
+ "//lldb/source/Plugins:PluginSymbolFileDWARFHeaders",
+ "//llvm:Core",
+ "//llvm:DebugInfoDWARF",
+ "//llvm:ExecutionEngine",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "Initialization",
+ srcs = glob(["source/Initialization/**/*.cpp"]),
+ hdrs = glob(["include/lldb/Initialization/**/*.h"]),
+ strip_include_prefix = "include",
+ deps = [
+ ":Core",
+ ":Headers",
+ ":Host",
+ ":TargetHeaders",
+ ":Utility",
+ ":Version",
+ "//lldb/source/Plugins:PluginProcessGDBRemote",
+ "//lldb/source/Plugins:PluginProcessPOSIX",
+ "//llvm:Support",
+ ],
+)
+
+gentbl_cc_library(
+ name = "InterpreterProperties",
+ strip_include_prefix = "source/Interpreter",
+ tbl_outs = [
+ (
+ ["-gen-lldb-property-defs"],
+ "source/Interpreter/InterpreterProperties.inc",
+ ),
+ (
+ ["-gen-lldb-property-enum-defs"],
+ "source/Interpreter/InterpreterPropertiesEnum.inc",
+ ),
+ ],
+ tblgen = ":lldb-tblgen",
+ td_file = "source/Interpreter/InterpreterProperties.td",
+ deps = [":CoreTdFiles"],
+)
+
+cc_library(
+ name = "APIHeaders",
+ hdrs = glob(["include/lldb/API/**/*.h"]),
+ strip_include_prefix = "include",
+)
+
+cc_library(
+ name = "InterpreterHeaders",
+ hdrs = glob(["include/lldb/Interpreter/**/*.h"]),
+ strip_include_prefix = "include",
+ deps = [":APIHeaders"],
+)
+
+cc_library(
+ name = "BreakpointHeaders",
+ hdrs = glob(["include/lldb/Breakpoint/**/*.h"]),
+ strip_include_prefix = "include",
+)
+
+cc_library(
+ name = "ExpressionHeaders",
+ hdrs = glob(["include/lldb/Expression/**/*.h"]),
+ strip_include_prefix = "include",
+ deps = ["//llvm:ExecutionEngine"],
+)
+
+cc_library(
+ name = "DataFormattersHeaders",
+ hdrs = glob(["include/lldb/DataFormatters/**/*.h"]),
+ strip_include_prefix = "include",
+)
+
+cc_library(
+ name = "Interpreter",
+ srcs = glob(["source/Interpreter/**/*.cpp"]),
+ deps = [
+ ":API",
+ ":Commands",
+ ":Core",
+ ":DataFormatters",
+ ":Headers",
+ ":Host",
+ ":InterpreterHeaders",
+ ":InterpreterProperties",
+ ":SymbolHeaders",
+ ":TargetHeaders",
+ ":Utility",
+ "//llvm:Support",
+ ],
+)
+
+td_library(
+ name = "CommandsTdFiles",
+ srcs = glob(["source/Commands/**/*.td"]),
+)
+
+gentbl_cc_library(
+ name = "CommandOptions",
+ strip_include_prefix = "source/Commands",
+ tbl_outs = [
+ (
+ ["-gen-lldb-option-defs"],
+ "source/Commands/CommandOptions.inc",
+ ),
+ ],
+ tblgen = ":lldb-tblgen",
+ td_file = "source/Commands/Options.td",
+ deps = [":CommandsTdFiles"],
+)
+
+cc_library(
+ name = "Commands",
+ srcs = glob(["source/Commands/**/*.cpp"]),
+ hdrs = glob(["source/Commands/**/*.h"]),
+ strip_include_prefix = "source",
+ deps = [
+ ":Breakpoint",
+ ":CommandOptions",
+ ":Core",
+ ":DataFormatters",
+ ":Expression",
+ ":Headers",
+ ":Host",
+ ":InterpreterHeaders",
+ ":SymbolHeaders",
+ ":Target",
+ ":TargetHeaders",
+ ":Utility",
+ ":Version",
+ "//clang:codegen",
+ "//clang:frontend",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "SymbolHeaders",
+ hdrs = glob(["include/lldb/Symbol/**/*.h"]),
+ strip_include_prefix = "include",
+)
+
+cc_library(
+ name = "Symbol",
+ srcs = glob(["source/Symbol/**/*.cpp"]),
+ deps = [
+ ":Core",
+ ":Expression",
+ ":Headers",
+ ":Host",
+ ":SymbolHeaders",
+ ":TargetHeaders",
+ ":Utility",
+ ":UtilityPrivateHeaders",
+ "//llvm:DebugInfo",
+ "//llvm:DebugInfoDWARF",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "HostMacOSXHeaders",
+ hdrs = glob([
+ "include/lldb/Host/*.h",
+ "include/lldb/Host/macosx/*.h",
+ "include/lldb/Host/posix/*.h",
+ ]),
+ strip_include_prefix = "include",
+ deps = [":Utility"],
+)
+
+cc_library(
+ name = "HostMacOSXPrivateHeaders",
+ hdrs = glob([
+ "source/Host/macosx/cfcpp/*.h",
+ "source/Host/macosx/objcxx/*.h",
+ ]),
+ strip_include_prefix = "source",
+ target_compatible_with = select({
+ "@platforms//os:macos": [],
+ "//conditions:default": ["@platforms//:incompatible"],
+ }),
+ deps = [":Utility"],
+)
+
+objc_library(
+ name = "HostMacOSXObjCXX",
+ srcs = glob([
+ "source/Host/macosx/objcxx/*.mm",
+ ]),
+ copts = OBJCPP_COPTS,
+ target_compatible_with = select({
+ "@platforms//os:macos": [],
+ "//conditions:default": ["@platforms//:incompatible"],
+ }),
+ deps = [
+ ":HostMacOSXHeaders",
+ ":HostMacOSXPrivateHeaders",
+ ],
+)
+
+cc_library(
+ name = "Host",
+ srcs = glob([
+ "source/Host/common/**/*.cpp",
+ ]) + select({
+ "@platforms//os:linux": glob(
+ [
+ "source/Host/posix/**/*.cpp",
+ "source/Host/linux/**/*.cpp",
+ ],
+ exclude = ["source/Host/linux/android/**/*.cpp"],
+ ),
+ "@platforms//os:macos": glob(
+ [
+ "source/Host/macosx/cfcpp/*.cpp",
+ "source/Host/posix/**/*.cpp",
+ ],
+ ),
+ }),
+ hdrs = [":ConfigHeader"] + glob([
+ "include/lldb/Host/*.h",
+ "include/lldb/Host/common/*.h",
+ ]) + select({
+ "@platforms//os:macos": glob([
+ "include/lldb/Host/macosx/*.h",
+ "include/lldb/Host/posix/*.h",
+ ]),
+ "@platforms//os:linux": glob([
+ "include/lldb/Host/linux/*.h",
+ "include/lldb/Host/posix/*.h",
+ ]),
+ }),
+ # TODO: Move this to Config library when https://github.com/bazelbuild/bazel/issues/21884 is fixed
+ linkopts = select({
+ "@platforms//os:macos": [
+ "-lcompression",
+ "-lxml2",
+ "-Wl,-framework,CoreServices",
+ "-Wl,-framework,Security",
+ ],
+ "//conditions:default": [],
+ }) + select({
+ ":curses_enabled": [
+ "-lcurses",
+ "-lpanel",
+ ],
+ "//conditions:default": [],
+ }) + select({
+ ":libedit_enabled": [
+ "-ledit",
+ ],
+ "//conditions:default": [],
+ }),
+ strip_include_prefix = "include",
+ deps = [
+ ":Headers",
+ ":Utility",
+ "//llvm:Object",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ "//llvm:config",
+ ] + select({
+ "@platforms//os:macos": [":HostMacOSXObjCXX"],
+ "//conditions:default": [],
+ }),
+)
+
+td_library(
+ name = "CoreTdFiles",
+ srcs = glob([
+ "source/Core/**/*.td",
+ "include/lldb/Core/*.td",
+ ]),
+)
+
+gentbl_cc_library(
+ name = "CoreProperties",
+ strip_include_prefix = "source/Core",
+ tbl_outs = [
+ (
+ ["-gen-lldb-property-defs"],
+ "source/Core/CoreProperties.inc",
+ ),
+ (
+ ["-gen-lldb-property-enum-defs"],
+ "source/Core/CorePropertiesEnum.inc",
+ ),
+ ],
+ tblgen = ":lldb-tblgen",
+ td_file = "source/Core/CoreProperties.td",
+ deps = [":CoreTdFiles"],
+)
+
+cc_library(
+ name = "CoreHeaders",
+ hdrs = glob(["include/lldb/Core/**/*.h"]),
+ strip_include_prefix = "include",
+ deps = [
+ ":BreakpointHeaders",
+ ":CoreProperties",
+ ":DataFormattersHeaders",
+ ":ExpressionHeaders",
+ ":Host",
+ ":InterpreterHeaders",
+ ":SymbolHeaders",
+ ":TargetHeaders",
+ ":Utility",
+ "//clang:driver",
+ "//llvm:Demangle",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ ],
+)
+
+cc_library(
+ name = "Core",
+ srcs = glob(["source/Core/**/*.cpp"]),
+ hdrs = glob(["include/lldb/Core/**/*.h"]),
+ strip_include_prefix = "include",
+ deps = [
+ ":BreakpointHeaders",
+ ":CoreHeaders",
+ ":CoreProperties",
+ ":DataFormattersHeaders",
+ ":ExpressionHeaders",
+ ":Headers",
+ ":Host",
+ ":InterpreterHeaders",
+ ":SymbolHeaders",
+ ":TargetHeaders",
+ ":Utility",
+ "//clang:driver",
+ "//lldb/source/Plugins:PluginCPlusPlusLanguageHeaders",
+ "//lldb/source/Plugins:PluginObjCLanguageHeaders",
+ "//llvm:Demangle",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ ],
+)
+
+gentbl_cc_library(
+ name = "TargetProperties",
+ strip_include_prefix = "source/Target",
+ tbl_outs = [
+ (
+ ["-gen-lldb-property-defs"],
+ "source/Target/TargetProperties.inc",
+ ),
+ (
+ ["-gen-lldb-property-enum-defs"],
+ "source/Target/TargetPropertiesEnum.inc",
+ ),
+ ],
+ tblgen = ":lldb-tblgen",
+ td_file = "source/Target/TargetProperties.td",
+ deps = [":CoreTdFiles"],
+)
+
+cc_library(
+ name = "AppleArm64ExceptionClass",
+ hdrs = ["include/lldb/Target/AppleArm64ExceptionClass.def"],
+ strip_include_prefix = "include/lldb/Target",
+)
+
+cc_library(
+ name = "TargetHeaders",
+ hdrs = glob(["include/lldb/Target/**/*.h"]),
+ strip_include_prefix = "include",
+ deps = [":AppleArm64ExceptionClass"],
+)
+
+cc_library(
+ name = "Target",
+ srcs = glob(["source/Target/**/*.cpp"]),
+ deps = [
+ ":BreakpointHeaders",
+ ":Core",
+ ":DataFormattersHeaders",
+ ":ExpressionHeaders",
+ ":Headers",
+ ":Host",
+ ":InterpreterHeaders",
+ ":Symbol",
+ ":SymbolHeaders",
+ ":TargetHeaders",
+ ":TargetProperties",
+ ":Utility",
+ "//lldb/source/Plugins:PluginProcessUtility",
+ "//llvm:MC",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "Headers",
+ hdrs = glob(["include/lldb/lldb-*.h"]),
+ strip_include_prefix = "include",
+)
+
+cc_library(
+ name = "UtilityPrivateHeaders",
+ hdrs = glob(["source/Utility/**/*.h"]),
+ strip_include_prefix = "source",
+ deps = [":Headers"],
+)
+
+cc_library(
+ name = "Utility",
+ srcs = glob(["source/Utility/**/*.cpp"]),
+ hdrs = glob(["include/lldb/Utility/**/*.h"]),
+ strip_include_prefix = "include",
+ deps = [
+ ":Headers",
+ ":UtilityPrivateHeaders",
+ "//llvm:BinaryFormat",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ "//llvm:config",
+ ],
+)
+
+cc_library(
+ name = "liblldb.static",
+ deps = [
+ ":API",
+ ":Host",
+ ":Interpreter",
+ "//llvm:AllTargetsDisassemblers",
+ ] + [
+ "//lldb/source/Plugins:Plugin{}".format(x)
+ for x in DEFAULT_PLUGINS + DEFAULT_SCRIPT_PLUGINS
+ ] + select({
+ "@platforms//os:macos": [
+ "//lldb/source/Plugins:PluginProcessMacOSXKernel",
+ "//lldb/source/Plugins:PluginSymbolLocatorDebugSymbols",
+ "//lldb/source/Plugins:PluginSymbolVendorMacOSX",
+ ],
+ "//conditions:default": [],
+ }),
+)
+
+cc_shared_library(
+ name = "liblldb",
+ # TODO: Remove once fixed https://github.com/bazelbuild/bazel/issues/21893
+ additional_linker_inputs = select({
+ "@platforms//os:macos": [
+ ":HostMacOSXObjCXX",
+ "//lldb/source/Plugins:PluginPlatformMacOSXObjCXX",
+ ],
+ "//conditions:default": [],
+ }),
+ shared_lib_name = select({
+ "@platforms//os:macos": "liblldb{}.dylib".format(PACKAGE_VERSION),
+ "@platforms//os:linux": "liblldb{}.so".format(PACKAGE_VERSION),
+ }),
+ # TODO: Remove once fixed https://github.com/bazelbuild/bazel/issues/21893
+ user_link_flags = select({
+ "@platforms//os:macos": [
+ "$(location :HostMacOSXObjCXX)",
+ "$(location //lldb/source/Plugins:PluginPlatformMacOSXObjCXX)",
+ ],
+ "//conditions:default": [],
+ }),
+ deps = [":liblldb.static"],
+)
+
+gentbl_cc_library(
+ name = "lldb_options_inc_gen",
+ strip_include_prefix = ".",
+ tbl_outs = [(
+ ["-gen-opt-parser-defs"],
+ "Options.inc",
+ )],
+ tblgen = "//llvm:llvm-tblgen",
+ td_file = "tools/driver/Options.td",
+ deps = ["//llvm:OptParserTdFiles"],
+)
+
+cc_binary(
+ name = "lldb",
+ srcs = glob([
+ "tools/driver/*.cpp",
+ "tools/driver/*.h",
+ ]),
+ data = [
+ ":lldb-argdumper",
+ ] + select({
+ "@platforms//os:macos": [":debugserver"],
+ "//conditions:default": [],
+ }),
+ deps = [
+ ":APIHeaders",
+ ":Host",
+ ":liblldb.static",
+ ":lldb_options_inc_gen",
+ "//llvm:Option",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "DebugServerCommonMacOSXHeaders",
+ hdrs = glob(["tools/debugserver/source/MacOSX/**/*.h"]),
+ strip_include_prefix = "tools/debugserver/source/MacOSX",
+)
+
+cc_library(
+ name = "DebugServerCommonHeaders",
+ hdrs = glob(["tools/debugserver/source/**/*.h"]),
+ strip_include_prefix = "tools/debugserver/source",
+ deps = [":DebugServerCommonMacOSXHeaders"],
+)
+
+objc_library(
+ name = "DebugServerMacOSX",
+ srcs = glob(["tools/debugserver/source/MacOSX/*.mm"]),
+ copts = OBJCPP_COPTS,
+ target_compatible_with = select({
+ "@platforms//os:macos": [],
+ "//conditions:default": ["@platforms//:incompatible"],
+ }),
+ deps = [
+ ":DebugServerCommonHeaders",
+ ":DebugServerCommonMacOSXHeaders",
+ ],
+)
+
+cc_library(
+ name = "DebugServerCommon",
+ srcs = glob(
+ ["tools/debugserver/source/**/*.cpp"],
+ exclude = ["tools/debugserver/source/debugserver.cpp"],
+ ),
+ local_defines = ["LLDB_USE_OS_LOG"],
+ deps = [
+ ":DebugServerCommonHeaders",
+ ":DebugServerCommonMacOSXHeaders",
+ ":DebugServerMacOSX",
+ ":Host",
+ ],
+)
+
+genrule(
+ name = "mach_gen",
+ srcs = ["tools/debugserver/source/MacOSX/dbgnub-mig.defs"],
+ outs = [
+ "mach_exc.h",
+ "mach_excServer.c",
+ "mach_excUser.c",
+ ],
+ cmd = "mig -header $(location :mach_exc.h) -server $(location :mach_excServer.c) -user $(location :mach_excUser.c) $(SRCS)",
+ target_compatible_with = select({
+ "@platforms//os:macos": [],
+ "//conditions:default": ["@platforms//:incompatible"],
+ }),
+)
+
+expand_template(
+ name = "debugserver_version_gen",
+ out = "debugserver_vers.c",
+ substitutions = _VERSION_SUBSTITUTIONS,
+ template = "tools/debugserver/source/debugserver_vers.c.in",
+)
+
+cc_binary(
+ name = "debugserver",
+ srcs = [
+ "tools/debugserver/source/debugserver.cpp",
+ ":debugserver_version_gen",
+ ":mach_gen",
+ ],
+ target_compatible_with = select({
+ "@platforms//os:macos": [],
+ "//conditions:default": ["@platforms//:incompatible"],
+ }),
+ deps = [":DebugServerCommon"],
+)
+
+cc_binary(
+ name = "lldb-argdumper",
+ srcs = glob(["tools/argdumper/*.cpp"]),
+ deps = ["//llvm:Support"],
+)
diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
new file mode 100644
index 000000000000..95773ed653b8
--- /dev/null
+++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
@@ -0,0 +1,2319 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+load("//mlir:tblgen.bzl", "gentbl_cc_library")
+load(":plugin_config.bzl", "DEFAULT_PLUGINS", "DEFAULT_SCRIPT_PLUGINS", "OBJCPP_COPTS")
+
+package(
+ default_visibility = ["//visibility:public"],
+ features = ["layering_check"],
+)
+
+licenses(["notice"])
+
+cc_library(
+ name = "PluginClangCommon",
+ srcs = glob(["Language/ClangCommon/*.cpp"]),
+ hdrs = glob(["Language/ClangCommon/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//clang:basic",
+ "//clang:lex",
+ "//lldb:CoreHeaders",
+ "//lldb:Host",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginObjCLanguageHeaders",
+ hdrs = glob(["Language/ObjC/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginClangCommon",
+ ":PluginExpressionParserClangHeaders",
+ "//lldb:CoreHeaders",
+ ],
+)
+
+cc_library(
+ name = "PluginObjCLanguage",
+ srcs = glob(["Language/ObjC/*.cpp"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginAppleObjCRuntime",
+ ":PluginExpressionParserClangHeaders",
+ ":PluginObjCLanguageHeaders",
+ ":PluginObjCRuntime",
+ ":PluginTypeSystemClangHeaders",
+ "//clang:ast",
+ "//clang:basic",
+ "//lldb:CoreHeaders",
+ "//lldb:DataFormattersHeaders",
+ "//lldb:ExpressionHeaders",
+ "//lldb:Host",
+ "//lldb:SymbolHeaders",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginTypeSystemClangHeaders",
+ hdrs = glob(["TypeSystem/Clang/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginExpressionParserClangHeaders",
+ "//clang:frontend",
+ "//lldb:CoreHeaders",
+ ],
+)
+
+cc_library(
+ name = "PluginCPPRuntime",
+ srcs = glob(["LanguageRuntime/CPlusPlus/*.cpp"]),
+ hdrs = glob(["LanguageRuntime/CPlusPlus/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:CoreHeaders",
+ "//lldb:Headers",
+ "//lldb:SymbolHeaders",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginObjCRuntime",
+ srcs = glob(["LanguageRuntime/ObjC/*.cpp"]),
+ hdrs = glob(["LanguageRuntime/ObjC/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginTypeSystemClangHeaders",
+ "//clang:ast",
+ "//lldb:BreakpointHeaders",
+ "//lldb:CoreHeaders",
+ "//lldb:Headers",
+ "//lldb:SymbolHeaders",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginAppleObjCRuntime",
+ srcs = glob(["LanguageRuntime/ObjC/AppleObjCRuntime/*.cpp"]),
+ hdrs = glob(["LanguageRuntime/ObjC/AppleObjCRuntime/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginCPPRuntime",
+ ":PluginExpressionParserClangHeaders",
+ ":PluginObjCLanguageHeaders",
+ ":PluginObjCRuntime",
+ ":PluginProcessUtility",
+ ":PluginTypeSystemClangHeaders",
+ "//clang:ast",
+ "//clang:basic",
+ "//lldb:BreakpointHeaders",
+ "//lldb:CoreHeaders",
+ "//lldb:DataFormattersHeaders",
+ "//lldb:ExpressionHeaders",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:InterpreterHeaders",
+ "//lldb:SymbolHeaders",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginTypeSystemClang",
+ srcs = glob(["TypeSystem/Clang/*.cpp"]),
+ deps = [
+ ":PluginExpressionParserClangHeaders",
+ ":PluginObjCRuntime",
+ ":PluginSymbolFileDWARF",
+ ":PluginSymbolFileDWARFHeaders",
+ ":PluginSymbolFileNativePDBHeaders",
+ ":PluginSymbolFilePDBHeaders",
+ ":PluginTypeSystemClangHeaders",
+ "//clang:ast",
+ "//clang:basic",
+ "//clang:frontend",
+ "//clang:lex",
+ "//clang:sema",
+ "//lldb:CoreHeaders",
+ "//lldb:Host",
+ "//lldb:SymbolHeaders",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginExpressionParserClangHeaders",
+ hdrs = glob(["ExpressionParser/Clang/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:CoreHeaders",
+ "//lldb:DataFormattersHeaders",
+ ],
+)
+
+cc_library(
+ name = "PluginExpressionParserClang",
+ srcs = glob(["ExpressionParser/Clang/*.cpp"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginCPPRuntime",
+ ":PluginCPlusPlusLanguageHeaders",
+ ":PluginExpressionParserClangHeaders",
+ ":PluginObjCRuntime",
+ ":PluginTypeSystemClang",
+ ":PluginTypeSystemClangHeaders",
+ "//clang:ast",
+ "//clang:basic",
+ "//clang:codegen",
+ "//clang:config",
+ "//clang:driver",
+ "//clang:edit",
+ "//clang:frontend",
+ "//clang:frontend_rewrite",
+ "//clang:lex",
+ "//clang:parse",
+ "//clang:rewrite",
+ "//clang:sema",
+ "//clang:serialization",
+ "//lldb:Core",
+ "//lldb:DataFormatters",
+ "//lldb:ExpressionHeaders",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:InterpreterHeaders",
+ "//lldb:SymbolHeaders",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Core",
+ "//llvm:ExecutionEngine",
+ "//llvm:IPO",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ ],
+)
+
+gentbl_cc_library(
+ name = "PlatformMacOSXProperties",
+ strip_include_prefix = "Platform/MacOSX",
+ tbl_outs = [
+ (
+ ["-gen-lldb-property-defs"],
+ "Platform/MacOSX/PlatformMacOSXProperties.inc",
+ ),
+ (
+ ["-gen-lldb-property-enum-defs"],
+ "Platform/MacOSX/PlatformMacOSXPropertiesEnum.inc",
+ ),
+ ],
+ tblgen = "//lldb:lldb-tblgen",
+ td_file = "Platform/MacOSX/PlatformMacOSXProperties.td",
+ deps = ["//lldb:CoreTdFiles"],
+)
+
+cc_library(
+ name = "PluginPlatformMacOSXObjCXXHeaders",
+ hdrs = glob(["Platform/MacOSX/objcxx/*.h"]),
+ include_prefix = "Plugins",
+ target_compatible_with = select({
+ "@platforms//os:macos": [],
+ "//conditions:default": ["@platforms//:incompatible"],
+ }),
+ deps = ["//lldb:Host"],
+)
+
+objc_library(
+ name = "PluginPlatformMacOSXObjCXX",
+ srcs = glob(["Platform/MacOSX/objcxx/*.mm"]),
+ copts = OBJCPP_COPTS,
+ target_compatible_with = select({
+ "@platforms//os:macos": [],
+ "//conditions:default": ["@platforms//:incompatible"],
+ }),
+ deps = [":PluginPlatformMacOSXObjCXXHeaders"],
+)
+
+cc_library(
+ name = "PluginPlatformMacOSX",
+ srcs = glob(
+ ["Platform/MacOSX/*.cpp"],
+ exclude = ["Platform/MacOSX/PlatformAppleSimulator.cpp"],
+ ) +
+ select({
+ "@platforms//os:macos": ["Platform/MacOSX/PlatformAppleSimulator.cpp"],
+ "//conditions:default": [],
+ }),
+ hdrs = glob(["Platform/MacOSX/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PlatformMacOSXProperties",
+ ":PluginDynamicLoaderDarwinKernelHeaders",
+ ":PluginObjectContainerMachOFileset",
+ ":PluginPlatformPOSIX",
+ "//clang:driver_options_inc_gen",
+ "//lldb:BreakpointHeaders",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:InterpreterHeaders",
+ "//lldb:SymbolHeaders",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ ] + select({
+ "@platforms//os:macos": [":PluginPlatformMacOSXObjCXX"],
+ "//conditions:default": [],
+ }),
+)
+
+gentbl_cc_library(
+ name = "SymbolFileDWARFProperties",
+ strip_include_prefix = "SymbolFile/DWARF",
+ tbl_outs = [
+ (
+ ["-gen-lldb-property-defs"],
+ "SymbolFile/DWARF/SymbolFileDWARFProperties.inc",
+ ),
+ (
+ ["-gen-lldb-property-enum-defs"],
+ "SymbolFile/DWARF/SymbolFileDWARFPropertiesEnum.inc",
+ ),
+ ],
+ tblgen = "//lldb:lldb-tblgen",
+ td_file = "SymbolFile/DWARF/SymbolFileDWARFProperties.td",
+ deps = ["//lldb:CoreTdFiles"],
+)
+
+cc_library(
+ name = "PluginSymbolFileDWARFHeaders",
+ hdrs = glob(["SymbolFile/DWARF/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginTypeSystemClangHeaders",
+ "//lldb:Core",
+ ],
+)
+
+cc_library(
+ name = "PluginSymbolFileDWARF",
+ srcs = glob(["SymbolFile/DWARF/*.cpp"]),
+ deps = [
+ ":PluginCPlusPlusLanguageHeaders",
+ ":PluginExpressionParserClangHeaders",
+ ":PluginObjCLanguageHeaders",
+ ":PluginSymbolFileDWARFHeaders",
+ ":PluginTypeSystemClangHeaders",
+ ":SymbolFileDWARFProperties",
+ "//clang:ast",
+ "//lldb:Core",
+ "//lldb:ExpressionHeaders",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:InterpreterHeaders",
+ "//lldb:SymbolHeaders",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:DebugInfoDWARF",
+ "//llvm:Demangle",
+ "//llvm:Object",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginProcessUtility",
+ srcs = glob(["Process/Utility/*.cpp"]),
+ hdrs = glob(["Process/Utility/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:BreakpointHeaders",
+ "//lldb:Core",
+ "//lldb:ExpressionHeaders",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:SymbolHeaders",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//lldb:UtilityPrivateHeaders",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ ],
+)
+
+cc_library(
+ name = "PluginObjectFilePDB",
+ srcs = glob(["ObjectFile/PDB/*.cpp"]),
+ hdrs = glob(["ObjectFile/PDB/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:SymbolHeaders",
+ "//lldb:Utility",
+ "//llvm:BinaryFormat",
+ "//llvm:DebugInfoPDB",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginSymbolFileNativePDBHeaders",
+ hdrs = glob(["SymbolFile/NativePDB/*.h"]),
+ include_prefix = "Plugins",
+ deps = ["//lldb:Core"],
+)
+
+cc_library(
+ name = "PluginSymbolFileNativePDB",
+ srcs = glob(["SymbolFile/NativePDB/*.cpp"]),
+ deps = [
+ ":PluginCPlusPlusLanguageHeaders",
+ ":PluginExpressionParserClangHeaders",
+ ":PluginObjectFilePDB",
+ ":PluginProcessUtility",
+ ":PluginSymbolFileNativePDBHeaders",
+ ":PluginSymbolFilePDBHeaders",
+ ":PluginTypeSystemClangHeaders",
+ "//lldb:Core",
+ "//lldb:ExpressionHeaders",
+ "//lldb:Headers",
+ "//lldb:SymbolHeaders",
+ "//lldb:Utility",
+ "//llvm:BinaryFormat",
+ "//llvm:DebugInfoCodeView",
+ "//llvm:DebugInfoMSF",
+ "//llvm:DebugInfoPDB",
+ "//llvm:Demangle",
+ "//llvm:Object",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginSymbolFilePDBHeaders",
+ hdrs = glob(["SymbolFile/PDB/*.h"]),
+ include_prefix = "Plugins",
+ deps = ["//lldb:Core"],
+)
+
+cc_library(
+ name = "PluginSymbolFilePDB",
+ srcs = glob(["SymbolFile/PDB/*.cpp"]),
+ deps = [
+ ":PluginCPlusPlusLanguageHeaders",
+ ":PluginExpressionParserClangHeaders",
+ ":PluginSymbolFileNativePDB",
+ ":PluginSymbolFileNativePDBHeaders",
+ ":PluginSymbolFilePDBHeaders",
+ ":PluginTypeSystemClangHeaders",
+ "//clang:ast",
+ "//clang:lex",
+ "//lldb:Core",
+ "//lldb:ExpressionHeaders",
+ "//lldb:SymbolHeaders",
+ "//lldb:Utility",
+ "//llvm:DebugInfoCodeView",
+ "//llvm:DebugInfoPDB",
+ ],
+)
+
+gentbl_cc_library(
+ name = "ProcessGDBRemoteProperties",
+ strip_include_prefix = "Process/gdb-remote",
+ tbl_outs = [
+ (
+ ["-gen-lldb-property-defs"],
+ "Process/gdb-remote/ProcessGDBRemoteProperties.inc",
+ ),
+ (
+ ["-gen-lldb-property-enum-defs"],
+ "Process/gdb-remote/ProcessGDBRemotePropertiesEnum.inc",
+ ),
+ ],
+ tblgen = "//lldb:lldb-tblgen",
+ td_file = "Process/gdb-remote/ProcessGDBRemoteProperties.td",
+ deps = ["//lldb:CoreTdFiles"],
+)
+
+cc_library(
+ name = "PluginProcessGDBRemote",
+ srcs = glob(["Process/gdb-remote/*.cpp"]),
+ hdrs = glob(["Process/gdb-remote/*.h"]) + [
+ "Process/gdb-remote/GDBRemoteErrno.def",
+ ],
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ ":ProcessGDBRemoteProperties",
+ "//lldb:BreakpointHeaders",
+ "//lldb:CoreHeaders",
+ "//lldb:DataFormattersHeaders",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:InterpreterHeaders",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//lldb:UtilityPrivateHeaders",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ "@llvm_zlib//:zlib",
+ ],
+)
+
+cc_library(
+ name = "PluginObjectContainerMachOArchive",
+ srcs = glob(["ObjectContainer/Universal-Mach-O/*.cpp"]),
+ hdrs = glob(["ObjectContainer/Universal-Mach-O/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginObjectContainerBSDArchive",
+ srcs = glob(["ObjectContainer/BSD-Archive/*.cpp"]),
+ hdrs = glob(["ObjectContainer/BSD-Archive/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Utility",
+ "//llvm:Object",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginObjectContainerMachOFileset",
+ srcs = glob(["ObjectContainer/Mach-O-Fileset/*.cpp"]),
+ hdrs = glob(["ObjectContainer/Mach-O-Fileset/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+gentbl_cc_library(
+ name = "StructuredDataDarwinLogProperties",
+ strip_include_prefix = "StructuredData/DarwinLog",
+ tbl_outs = [
+ (
+ ["-gen-lldb-property-defs"],
+ "StructuredData/DarwinLog/StructuredDataDarwinLogProperties.inc",
+ ),
+ (
+ ["-gen-lldb-property-enum-defs"],
+ "StructuredData/DarwinLog/StructuredDataDarwinLogPropertiesEnum.inc",
+ ),
+ ],
+ tblgen = "//lldb:lldb-tblgen",
+ td_file = "StructuredData/DarwinLog/StructuredDataDarwinLogProperties.td",
+ deps = ["//lldb:CoreTdFiles"],
+)
+
+cc_library(
+ name = "PluginStructuredDataDarwinLog",
+ srcs = glob(["StructuredData/DarwinLog/*.cpp"]),
+ hdrs = glob(["StructuredData/DarwinLog/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":StructuredDataDarwinLogProperties",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Interpreter",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginTraceCommon",
+ srcs = glob(["Trace/common/*.cpp"]),
+ hdrs = glob(["Trace/common/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ ],
+)
+
+cc_library(
+ name = "PluginPlatformPOSIX",
+ srcs = glob(["Platform/POSIX/*.cpp"]),
+ hdrs = glob(["Platform/POSIX/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginPlatformGDB",
+ ":PluginTypeSystemClang",
+ ":PluginTypeSystemClangHeaders",
+ "//lldb:Core",
+ "//lldb:Expression",
+ "//lldb:Host",
+ "//lldb:Interpreter",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+gentbl_cc_library(
+ name = "PlatformQemuUserProperties",
+ strip_include_prefix = "Platform/QemuUser",
+ tbl_outs = [
+ (
+ ["-gen-lldb-property-defs"],
+ "Platform/QemuUser/PlatformQemuUserProperties.inc",
+ ),
+ (
+ ["-gen-lldb-property-enum-defs"],
+ "Platform/QemuUser/PlatformQemuUserPropertiesEnum.inc",
+ ),
+ ],
+ tblgen = "//lldb:lldb-tblgen",
+ td_file = "Platform/QemuUser/PlatformQemuUserProperties.td",
+ deps = ["//lldb:CoreTdFiles"],
+)
+
+cc_library(
+ name = "PluginPlatformQemuUser",
+ srcs = glob(["Platform/QemuUser/*.cpp"]),
+ hdrs = glob(["Platform/QemuUser/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PlatformQemuUserProperties",
+ ":PluginProcessGDBRemote",
+ "//lldb:CoreHeaders",
+ "//lldb:Host",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginPlatformGDB",
+ srcs = glob(["Platform/gdb-server/*.cpp"]),
+ hdrs = glob(["Platform/gdb-server/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessGDBRemote",
+ ":PluginProcessUtility",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginPlatformLinux",
+ srcs = glob(["Platform/Linux/*.cpp"]),
+ hdrs = glob(["Platform/Linux/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginPlatformPOSIX",
+ ":PluginTypeSystemClangHeaders",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Interpreter",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//lldb:UtilityPrivateHeaders",
+ ],
+)
+
+gentbl_cc_library(
+ name = "PlatformAndroidProperties",
+ strip_include_prefix = "Platform/Android",
+ tbl_outs = [
+ (
+ ["-gen-lldb-property-defs"],
+ "Platform/Android/PlatformAndroidProperties.inc",
+ ),
+ (
+ ["-gen-lldb-property-enum-defs"],
+ "Platform/Android/PlatformAndroidPropertiesEnum.inc",
+ ),
+ ],
+ tblgen = "//lldb:lldb-tblgen",
+ td_file = "Platform/Android/PlatformAndroidProperties.td",
+ deps = ["//lldb:CoreTdFiles"],
+)
+
+cc_library(
+ name = "PluginPlatformAndroid",
+ srcs = glob(["Platform/Android/*.cpp"]),
+ hdrs = glob(["Platform/Android/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PlatformAndroidProperties",
+ ":PluginPlatformGDB",
+ ":PluginPlatformLinux",
+ ":PluginPlatformPOSIX",
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginPlatformWindows",
+ srcs = glob(["Platform/Windows/*.cpp"]),
+ hdrs = glob(["Platform/Windows/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginPlatformGDB",
+ ":PluginTypeSystemClangHeaders",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:ExpressionHeaders",
+ "//lldb:Host",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginMemoryHistoryASan",
+ srcs = glob(["MemoryHistory/asan/*.cpp"]),
+ hdrs = glob(["MemoryHistory/asan/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Expression",
+ "//lldb:Headers",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ ],
+)
+
+cc_library(
+ name = "PluginClangREPL",
+ srcs = glob(["REPL/Clang/*.cpp"]),
+ hdrs = glob(["REPL/Clang/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginCPPRuntime",
+ ":PluginClangCommon",
+ ":PluginTypeSystemClang",
+ "//lldb:Core",
+ "//lldb:DataFormatters",
+ "//lldb:ExpressionHeaders",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:Target",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginSymbolVendorWasm",
+ srcs = glob(["SymbolVendor/wasm/*.cpp"]),
+ hdrs = glob(["SymbolVendor/wasm/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginObjectFileWasm",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginSymbolVendorMacOSX",
+ srcs = glob(["SymbolVendor/MacOSX/*.cpp"]),
+ hdrs = glob(["SymbolVendor/MacOSX/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginObjectFileMachO",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginSymbolVendorPECOFF",
+ srcs = glob(["SymbolVendor/PECOFF/*.cpp"]),
+ hdrs = glob(["SymbolVendor/PECOFF/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginObjectFilePECOFF",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginSymbolVendorELF",
+ srcs = glob(["SymbolVendor/ELF/*.cpp"]),
+ hdrs = glob(["SymbolVendor/ELF/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginObjectFileELF",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginObjCPlusPlusLanguage",
+ srcs = glob(["Language/ObjCPlusPlus/*.cpp"]),
+ hdrs = glob(["Language/ObjCPlusPlus/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginClangCommon",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginCPlusPlusLanguageHeaders",
+ hdrs = glob(["Language/CPlusPlus/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginClangCommon",
+ ":PluginExpressionParserClangHeaders",
+ "//lldb:CoreHeaders",
+ ],
+)
+
+cc_library(
+ name = "PluginCPlusPlusLanguage",
+ srcs = glob(["Language/CPlusPlus/*.cpp"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginCPPRuntime",
+ ":PluginCPlusPlusLanguageHeaders",
+ ":PluginClangCommon",
+ ":PluginExpressionParserClangHeaders",
+ ":PluginTypeSystemClang",
+ ":PluginTypeSystemClangHeaders",
+ "//clang:basic",
+ "//lldb:Core",
+ "//lldb:DataFormatters",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Demangle",
+ "//llvm:Support",
+ ],
+)
+
+gentbl_cc_library(
+ name = "TraceExporterCTFOptions",
+ strip_include_prefix = "TraceExporter/ctf",
+ tbl_outs = [(
+ ["-gen-lldb-option-defs"],
+ "TraceExporter/ctf/TraceExporterCTFCommandOptions.inc",
+ )],
+ tblgen = "//lldb:lldb-tblgen",
+ td_file = "TraceExporter/ctf/TraceExporterCTFOptions.td",
+ deps = [
+ "//lldb:CommandsTdFiles",
+ "//lldb:CoreTdFiles",
+ ],
+)
+
+cc_library(
+ name = "PluginTraceExporterCTF",
+ srcs = glob(["TraceExporter/ctf/*.cpp"]),
+ hdrs = glob(["TraceExporter/ctf/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginTraceExporterCommon",
+ ":TraceExporterCTFOptions",
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Symbol",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ ],
+)
+
+cc_library(
+ name = "PluginTraceExporterCommon",
+ srcs = glob(["TraceExporter/common/*.cpp"]),
+ hdrs = glob(["TraceExporter/common/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginABIPowerPC",
+ srcs = glob(["ABI/PowerPC/*.cpp"]),
+ hdrs = glob(["ABI/PowerPC/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ ":PluginTypeSystemClang",
+ ":PluginTypeSystemClangHeaders",
+ "//clang:ast",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//lldb:UtilityPrivateHeaders",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ ],
+)
+
+cc_library(
+ name = "PluginABIHexagon",
+ srcs = glob(["ABI/Hexagon/*.cpp"]),
+ hdrs = glob(["ABI/Hexagon/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Core",
+ "//llvm:TargetParser",
+ ],
+)
+
+cc_library(
+ name = "PluginABIMips",
+ srcs = glob(["ABI/Mips/*.cpp"]),
+ hdrs = glob(["ABI/Mips/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ ],
+)
+
+cc_library(
+ name = "PluginABIMSP430",
+ srcs = glob(["ABI/MSP430/*.cpp"]),
+ hdrs = glob(["ABI/MSP430/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Core",
+ "//llvm:TargetParser",
+ ],
+)
+
+cc_library(
+ name = "PluginABIX86",
+ srcs = glob(["ABI/X86/*.cpp"]),
+ hdrs = glob(["ABI/X86/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ ],
+)
+
+cc_library(
+ name = "PluginABIARM",
+ srcs = glob(["ABI/ARM/*.cpp"]),
+ hdrs = glob(["ABI/ARM/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//lldb:UtilityPrivateHeaders",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ ],
+)
+
+cc_library(
+ name = "PluginABIARC",
+ srcs = glob(["ABI/ARC/*.cpp"]),
+ hdrs = glob(["ABI/ARC/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Core",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ ],
+)
+
+cc_library(
+ name = "PluginABIRISCV",
+ srcs = glob(["ABI/RISCV/*.cpp"]),
+ hdrs = glob(["ABI/RISCV/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Symbol",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Core",
+ "//llvm:TargetParser",
+ ],
+)
+
+cc_library(
+ name = "PluginABISystemZ",
+ srcs = glob(["ABI/SystemZ/*.cpp"]),
+ hdrs = glob(["ABI/SystemZ/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ ],
+)
+
+cc_library(
+ name = "PluginABIAArch64",
+ srcs = glob(["ABI/AArch64/*.cpp"]),
+ hdrs = glob(["ABI/AArch64/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//lldb:UtilityPrivateHeaders",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ ],
+)
+
+cc_library(
+ name = "PluginDynamicLoaderPosixDYLDHeaders",
+ hdrs = glob(["DynamicLoader/POSIX-DYLD/*.h"]),
+ include_prefix = "Plugins",
+)
+
+cc_library(
+ name = "PluginDynamicLoaderPosixDYLD",
+ srcs = glob(["DynamicLoader/POSIX-DYLD/*.cpp"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginDynamicLoaderPosixDYLDHeaders",
+ ":PluginProcessElfCore",
+ ":PluginProcessUtility",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginDynamicLoaderWindowsDYLD",
+ srcs = glob(["DynamicLoader/Windows-DYLD/*.cpp"]),
+ hdrs = glob(["DynamicLoader/Windows-DYLD/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:TargetParser",
+ ],
+)
+
+cc_library(
+ name = "PluginDynamicLoaderHexagonDYLD",
+ srcs = glob(["DynamicLoader/Hexagon-DYLD/*.cpp"]),
+ hdrs = glob(["DynamicLoader/Hexagon-DYLD/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginDynamicLoaderWasmDYLD",
+ srcs = glob(["DynamicLoader/wasm-DYLD/*.cpp"]),
+ hdrs = glob(["DynamicLoader/wasm-DYLD/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginObjectFileWasm",
+ "//lldb:Core",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginDynamicLoaderStatic",
+ srcs = glob(["DynamicLoader/Static/*.cpp"]),
+ hdrs = glob(["DynamicLoader/Static/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginDynamicLoaderMacOSXDYLD",
+ srcs = glob(["DynamicLoader/MacOSX-DYLD/*.cpp"]),
+ hdrs = glob(["DynamicLoader/MacOSX-DYLD/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginObjCRuntime",
+ ":PluginTypeSystemClang",
+ ":PluginTypeSystemClangHeaders",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Expression",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:TargetParser",
+ ],
+)
+
+gentbl_cc_library(
+ name = "DynamicLoaderDarwinKernelProperties",
+ strip_include_prefix = "DynamicLoader/Darwin-Kernel",
+ tbl_outs = [
+ (
+ ["-gen-lldb-property-defs"],
+ "DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernelProperties.inc",
+ ),
+ (
+ ["-gen-lldb-property-enum-defs"],
+ "DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernelPropertiesEnum.inc",
+ ),
+ ],
+ tblgen = "//lldb:lldb-tblgen",
+ td_file = "DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernelProperties.td",
+ deps = ["//lldb:CoreTdFiles"],
+)
+
+cc_library(
+ name = "PluginDynamicLoaderDarwinKernelHeaders",
+ hdrs = glob(["DynamicLoader/Darwin-Kernel/*.h"]),
+ include_prefix = "Plugins",
+)
+
+cc_library(
+ name = "PluginDynamicLoaderDarwinKernel",
+ srcs = glob(["DynamicLoader/Darwin-Kernel/*.cpp"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":DynamicLoaderDarwinKernelProperties",
+ ":PluginDynamicLoaderDarwinKernelHeaders",
+ ":PluginObjectFileMachO",
+ ":PluginPlatformMacOSX",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Interpreter",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginArchitecturePPC64",
+ srcs = glob(["Architecture/PPC64/*.cpp"]),
+ hdrs = glob(["Architecture/PPC64/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:BinaryFormat",
+ ],
+)
+
+cc_library(
+ name = "PluginArchitectureMips",
+ srcs = glob(["Architecture/Mips/*.cpp"]),
+ hdrs = glob(["Architecture/Mips/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginArchitectureArm",
+ srcs = glob(["Architecture/Arm/*.cpp"]),
+ hdrs = glob(["Architecture/Arm/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginArchitectureAArch64",
+ srcs = glob(["Architecture/AArch64/*.cpp"]),
+ hdrs = glob(["Architecture/AArch64/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginDisassemblerLLVMC",
+ srcs = glob(["Disassembler/LLVMC/*.cpp"]),
+ hdrs = glob(["Disassembler/LLVMC/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:MC",
+ "//llvm:MCDisassembler",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ ],
+)
+
+cc_library(
+ name = "PluginSymbolFileSymtab",
+ srcs = glob(["SymbolFile/Symtab/*.cpp"]),
+ hdrs = glob(["SymbolFile/Symtab/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginSymbolFileCTF",
+ srcs = glob(["SymbolFile/CTF/*.cpp"]),
+ hdrs = glob(["SymbolFile/CTF/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginExpressionParserClangHeaders",
+ ":PluginTypeSystemClangHeaders",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ "@llvm_zlib//:zlib",
+ ],
+)
+
+cc_library(
+ name = "PluginSymbolFileJSON",
+ srcs = glob(["SymbolFile/JSON/*.cpp"]),
+ hdrs = glob(["SymbolFile/JSON/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginObjectFileJSON",
+ "//lldb:Core",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginSymbolFileBreakpad",
+ srcs = glob(["SymbolFile/Breakpad/*.cpp"]),
+ hdrs = glob(["SymbolFile/Breakpad/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginObjectFileBreakpad",
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginInstructionPPC64",
+ srcs = glob(["Instruction/PPC64/*.cpp"]),
+ hdrs = glob(["Instruction/PPC64/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Interpreter",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginInstructionLoongArch",
+ srcs = glob(["Instruction/LoongArch/*.cpp"]),
+ hdrs = glob(["Instruction/LoongArch/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Interpreter",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginInstructionMIPS",
+ srcs = glob(["Instruction/MIPS/*.cpp"]),
+ hdrs = glob(["Instruction/MIPS/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:MC",
+ "//llvm:MCDisassembler",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginInstructionMIPS64",
+ srcs = glob(["Instruction/MIPS64/*.cpp"]),
+ hdrs = glob(["Instruction/MIPS64/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Interpreter",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Utility",
+ "//llvm:MC",
+ "//llvm:MCDisassembler",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginInstructionARM",
+ srcs = glob(["Instruction/ARM/*.cpp"]),
+ hdrs = glob(["Instruction/ARM/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Interpreter",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//lldb:UtilityPrivateHeaders",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginInstructionRISCV",
+ srcs = glob(["Instruction/RISCV/*.cpp"]),
+ hdrs = glob(["Instruction/RISCV/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Interpreter",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginInstructionARM64",
+ srcs = glob(["Instruction/ARM64/*.cpp"]),
+ hdrs = glob(["Instruction/ARM64/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Interpreter",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginInstrumentationRuntimeASanLibsanitizers",
+ srcs = glob(["InstrumentationRuntime/ASanLibsanitizers/*.cpp"]),
+ hdrs = glob(["InstrumentationRuntime/ASanLibsanitizers/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginInstrumentationRuntimeUtility",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginInstrumentationRuntimeTSan",
+ srcs = glob(["InstrumentationRuntime/TSan/*.cpp"]),
+ hdrs = glob(["InstrumentationRuntime/TSan/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Expression",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:Interpreter",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginInstrumentationRuntimeASan",
+ srcs = glob(["InstrumentationRuntime/ASan/*.cpp"]),
+ hdrs = glob(["InstrumentationRuntime/ASan/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginInstrumentationRuntimeUtility",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginInstrumentationRuntimeMainThreadChecker",
+ srcs = glob(["InstrumentationRuntime/MainThreadChecker/*.cpp"]),
+ hdrs = glob(["InstrumentationRuntime/MainThreadChecker/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Expression",
+ "//lldb:Headers",
+ "//lldb:Interpreter",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginInstrumentationRuntimeUBSan",
+ srcs = glob(["InstrumentationRuntime/UBSan/*.cpp"]),
+ hdrs = glob(["InstrumentationRuntime/UBSan/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Expression",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:Interpreter",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginInstrumentationRuntimeUtility",
+ srcs = glob(["InstrumentationRuntime/Utility/*.cpp"]),
+ hdrs = glob(["InstrumentationRuntime/Utility/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Expression",
+ "//lldb:Symbol",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ ],
+)
+
+gentbl_cc_library(
+ name = "JITLoaderGDBProperties",
+ strip_include_prefix = "JITLoader/GDB",
+ tbl_outs = [
+ (
+ ["-gen-lldb-property-defs"],
+ "JITLoader/GDB/JITLoaderGDBProperties.inc",
+ ),
+ (
+ ["-gen-lldb-property-enum-defs"],
+ "JITLoader/GDB/JITLoaderGDBPropertiesEnum.inc",
+ ),
+ ],
+ tblgen = "//lldb:lldb-tblgen",
+ td_file = "JITLoader/GDB/JITLoaderGDBProperties.td",
+ deps = ["//lldb:CoreTdFiles"],
+)
+
+cc_library(
+ name = "PluginJITLoaderGDB",
+ srcs = glob(["JITLoader/GDB/*.cpp"]),
+ hdrs = glob(["JITLoader/GDB/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":JITLoaderGDBProperties",
+ ":PluginObjectFileMachO",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Interpreter",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginSymbolLocatorDefault",
+ srcs = glob(["SymbolLocator/Default/*.cpp"]),
+ hdrs = glob(["SymbolLocator/Default/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginObjectFileWasm",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+gentbl_cc_library(
+ name = "SymbolLocatorDebuginfodProperties",
+ strip_include_prefix = "SymbolLocator/Debuginfod",
+ tbl_outs = [
+ (
+ ["-gen-lldb-property-defs"],
+ "SymbolLocator/Debuginfod/SymbolLocatorDebuginfodProperties.inc",
+ ),
+ (
+ ["-gen-lldb-property-enum-defs"],
+ "SymbolLocator/Debuginfod/SymbolLocatorDebuginfodPropertiesEnum.inc",
+ ),
+ ],
+ tblgen = "//lldb:lldb-tblgen",
+ td_file = "SymbolLocator/Debuginfod/SymbolLocatorDebuginfodProperties.td",
+ deps = ["//lldb:CoreTdFiles"],
+)
+
+cc_library(
+ name = "PluginSymbolLocatorDebuginfod",
+ srcs = glob(["SymbolLocator/Debuginfod/*.cpp"]),
+ hdrs = glob(["SymbolLocator/Debuginfod/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":SymbolLocatorDebuginfodProperties",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Utility",
+ "//llvm:Debuginfod",
+ ],
+)
+
+cc_library(
+ name = "PluginSymbolLocatorDebugSymbols",
+ srcs = glob(["SymbolLocator/DebugSymbols/*.cpp"]),
+ hdrs = glob(["SymbolLocator/DebugSymbols/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginObjectFileWasm",
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:HostMacOSXPrivateHeaders",
+ "//lldb:Symbol",
+ ],
+)
+
+# TODO: python support
+# cc_library(
+# name = "PluginOperatingSystemPython",
+# srcs = glob(["OperatingSystem/Python/*.cpp"]),
+# hdrs = glob(["OperatingSystem/Python/*.h"]),
+# include_prefix = "Plugins",
+# deps = [
+# "//lldb:Core",
+# "//lldb:Interpreter",
+# ":PluginProcessUtility",
+# "//lldb:Symbol",
+# "//lldb:Target",
+# ],
+# )
+# cc_library(
+# name = "PluginScriptInterpreterPythonInterfaces",
+# srcs = glob(["ScriptInterpreter/Python/Interfaces/*.cpp"]),
+# hdrs = glob(["ScriptInterpreter/Python/Interfaces/*.h"]),
+# include_prefix = "Plugins",
+# deps = [
+# "//lldb:Core",
+# "//lldb:Host",
+# "//lldb:Interpreter",
+# "//lldb:Target",
+# "@rules_python//python/cc:current_py_cc_headers",
+# "@rules_python//python/cc:current_py_cc_libs",
+# ],
+# )
+# cc_library(
+# name = "PluginScriptInterpreterPythonHeaders",
+# hdrs = glob(["ScriptInterpreter/Python/*.h"]),
+# include_prefix = "Plugins",
+# deps = [
+# "//lldb:Host",
+# ],
+# )
+# cc_library(
+# name = "PluginScriptInterpreterPython",
+# srcs = glob(["ScriptInterpreter/Python/*.cpp"]),
+# local_defines = [
+# 'LLDB_PYTHON_EXE_RELATIVE_PATH=\\"bin/python3\\"',
+# # Must be kept in sync with WORKSPACE python version
+# 'LLDB_PYTHON_RELATIVE_LIBDIR=\\"lib/python3.11/site-packages\\"',
+# ],
+# include_prefix = "Plugins",
+# deps = [
+# "//lldb:Breakpoint",
+# "//lldb:Core",
+# "//lldb:DataFormatters",
+# "//lldb:Host",
+# "//lldb:Interpreter",
+# ":PluginScriptInterpreterPythonHeaders",
+# ":PluginScriptInterpreterPythonInterfaces",
+# "//lldb:Target",
+# ],
+# )
+
+# TODO: lua support
+# cc_library(
+# name = "PluginScriptInterpreterLua",
+# srcs = glob(["ScriptInterpreter/Lua/*.cpp"]),
+# hdrs = glob(["ScriptInterpreter/Lua/*.h"]),
+# include_prefix = "Plugins",
+# deps = [
+# "//lldb:Core",
+# "//lldb:Interpreter",
+# ],
+# )
+
+cc_library(
+ name = "PluginScriptInterpreterNone",
+ srcs = glob(["ScriptInterpreter/None/*.cpp"]),
+ hdrs = glob(["ScriptInterpreter/None/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Interpreter",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginSystemRuntimeMacOSX",
+ srcs = glob(["SystemRuntime/MacOSX/*.cpp"]),
+ hdrs = glob(["SystemRuntime/MacOSX/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ ":PluginTypeSystemClang",
+ ":PluginTypeSystemClangHeaders",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Expression",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginObjectFileCOFF",
+ srcs = glob(["ObjectFile/COFF/*.cpp"]),
+ hdrs = glob(["ObjectFile/COFF/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:Utility",
+ "//llvm:Object",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginObjectFileWasm",
+ srcs = glob(["ObjectFile/wasm/*.cpp"]),
+ hdrs = glob(["ObjectFile/wasm/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:BinaryFormat",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginObjectFileJSON",
+ srcs = glob(["ObjectFile/JSON/*.cpp"]),
+ hdrs = glob(["ObjectFile/JSON/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginObjectFilePlaceholder",
+ srcs = glob(["ObjectFile/Placeholder/*.cpp"]),
+ hdrs = glob(["ObjectFile/Placeholder/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginObjectFileMachO",
+ srcs = glob(["ObjectFile/Mach-O/*.cpp"]),
+ hdrs = glob(["ObjectFile/Mach-O/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginObjectFileMinidump",
+ srcs = glob(["ObjectFile/Minidump/*.cpp"]),
+ hdrs = glob(["ObjectFile/Minidump/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessMinidump",
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:BinaryFormat",
+ "//llvm:Object",
+ "//llvm:Support",
+ ],
+)
+
+gentbl_cc_library(
+ name = "ObjectFilePECOFFProperties",
+ strip_include_prefix = "ObjectFile/PECOFF",
+ tbl_outs = [
+ (
+ ["-gen-lldb-property-defs"],
+ "ObjectFile/PECOFF/ObjectFilePECOFFProperties.inc",
+ ),
+ (
+ ["-gen-lldb-property-enum-defs"],
+ "ObjectFile/PECOFF/ObjectFilePECOFFPropertiesEnum.inc",
+ ),
+ ],
+ tblgen = "//lldb:lldb-tblgen",
+ td_file = "ObjectFile/PECOFF/ObjectFilePECOFFProperties.td",
+ deps = ["//lldb:CoreTdFiles"],
+)
+
+cc_library(
+ name = "PluginObjectFilePECOFF",
+ srcs = glob(["ObjectFile/PECOFF/*.cpp"]),
+ hdrs = glob(["ObjectFile/PECOFF/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":ObjectFilePECOFFProperties",
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:BinaryFormat",
+ "//llvm:Object",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ ],
+)
+
+cc_library(
+ name = "PluginObjectFileBreakpad",
+ srcs = glob(["ObjectFile/Breakpad/*.cpp"]),
+ hdrs = glob(["ObjectFile/Breakpad/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ ],
+)
+
+cc_library(
+ name = "PluginObjectFileELF",
+ srcs = glob(["ObjectFile/ELF/*.cpp"]),
+ hdrs = glob(["ObjectFile/ELF/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:BinaryFormat",
+ "//llvm:Object",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginUnwindAssemblyX86",
+ srcs = glob(["UnwindAssembly/x86/*.cpp"]),
+ hdrs = glob(["UnwindAssembly/x86/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:MCDisassembler",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginUnwindAssemblyInstEmulation",
+ srcs = glob(["UnwindAssembly/InstEmulation/*.cpp"]),
+ hdrs = glob(["UnwindAssembly/InstEmulation/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginProcessPOSIX",
+ srcs = glob(["Process/POSIX/*.cpp"]),
+ hdrs = glob(["Process/POSIX/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:BinaryFormat",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginScriptedProcess",
+ srcs = glob(["Process/scripted/*.cpp"]),
+ hdrs = glob(["Process/scripted/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginProcessMachCore",
+ srcs = glob(["Process/mach-core/*.cpp"]),
+ hdrs = glob(["Process/mach-core/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginDynamicLoaderDarwinKernelHeaders",
+ ":PluginDynamicLoaderMacOSXDYLD",
+ ":PluginDynamicLoaderStatic",
+ ":PluginObjectFileMachO",
+ ":PluginPlatformMacOSX",
+ ":PluginProcessUtility",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Host",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginProcessElfCore",
+ srcs = glob(["Process/elf-core/*.cpp"]),
+ hdrs = glob(["Process/elf-core/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginDynamicLoaderPosixDYLDHeaders",
+ ":PluginObjectFileELF",
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:BinaryFormat",
+ "//llvm:Support",
+ ],
+)
+
+gentbl_cc_library(
+ name = "ProcessKDPProperties",
+ strip_include_prefix = "Process/MacOSX-Kernel",
+ tbl_outs = [
+ (
+ ["-gen-lldb-property-defs"],
+ "Process/MacOSX-Kernel/ProcessKDPProperties.inc",
+ ),
+ (
+ ["-gen-lldb-property-enum-defs"],
+ "Process/MacOSX-Kernel/ProcessKDPPropertiesEnum.inc",
+ ),
+ ],
+ tblgen = "//lldb:lldb-tblgen",
+ td_file = "Process/MacOSX-Kernel/ProcessKDPProperties.td",
+ deps = ["//lldb:CoreTdFiles"],
+)
+
+cc_library(
+ name = "PluginProcessMacOSXKernel",
+ srcs = glob(["Process/MacOSX-Kernel/*.cpp"]),
+ hdrs = glob(["Process/MacOSX-Kernel/*.h"]),
+ include_prefix = "Plugins",
+ target_compatible_with = select({
+ "@platforms//os:macos": [],
+ "//conditions:default": ["@platforms//:incompatible"],
+ }),
+ deps = [
+ ":PluginDynamicLoaderDarwinKernel",
+ ":PluginDynamicLoaderDarwinKernelHeaders",
+ ":PluginDynamicLoaderStatic",
+ ":PluginProcessUtility",
+ ":ProcessKDPProperties",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:Interpreter",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginProcessMinidump",
+ srcs = glob(["Process/minidump/*.cpp"]),
+ hdrs = glob(["Process/minidump/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginObjectFilePlaceholder",
+ ":PluginProcessElfCore",
+ ":PluginProcessUtility",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//lldb:UtilityPrivateHeaders",
+ "//llvm:BinaryFormat",
+ "//llvm:Object",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginCXXItaniumABI",
+ srcs = glob(["LanguageRuntime/CPlusPlus/ItaniumABI/*.cpp"]),
+ hdrs = glob(["LanguageRuntime/CPlusPlus/ItaniumABI/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginCPPRuntime",
+ ":PluginTypeSystemClang",
+ ":PluginTypeSystemClangHeaders",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:DataFormattersHeaders",
+ "//lldb:ExpressionHeaders",
+ "//lldb:Headers",
+ "//lldb:Interpreter",
+ "//lldb:InterpreterHeaders",
+ "//lldb:Symbol",
+ "//lldb:SymbolHeaders",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ ],
+)
+
+cc_library(
+ name = "PluginGNUstepObjCRuntime",
+ srcs = glob(["LanguageRuntime/ObjC/GNUstepObjCRuntime/*.cpp"]),
+ hdrs = glob(["LanguageRuntime/ObjC/GNUstepObjCRuntime/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginExpressionParserClang",
+ ":PluginObjCRuntime",
+ ":PluginTypeSystemClang",
+ ":PluginTypeSystemClangHeaders",
+ "//lldb:Breakpoint",
+ "//lldb:Core",
+ "//lldb:Expression",
+ "//lldb:Headers",
+ "//lldb:Host",
+ "//lldb:Interpreter",
+ "//lldb:Symbol",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ "//lldb:Utility",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "PluginRegisterTypeBuilderClang",
+ srcs = glob(["RegisterTypeBuilder/*.cpp"]),
+ hdrs = glob(["RegisterTypeBuilder/*.h"]),
+ include_prefix = "Plugins",
+ deps = [
+ ":PluginTypeSystemClangHeaders",
+ "//clang:ast",
+ "//lldb:Core",
+ "//lldb:Headers",
+ "//lldb:Target",
+ "//lldb:TargetHeaders",
+ ],
+)
+
+_DEFAULT_LOAD_PLUGINS = "\n".join(["LLDB_PLUGIN({})".format(x) for x in DEFAULT_PLUGINS]) + \
+ "\n" + "\n".join(["LLDB_SCRIPT_PLUGIN({})".format(x) for x in DEFAULT_SCRIPT_PLUGINS])
+
+expand_template(
+ name = "plugins_config_gen",
+ out = "Plugins.def",
+ substitutions = {
+ "@LLDB_PROCESS_WINDOWS_PLUGIN@": "",
+ "@LLDB_PROCESS_GDB_PLUGIN@": "LLDB_PLUGIN(ProcessGDBRemote)",
+ } | select({
+ "@platforms//os:macos": {
+ "@LLDB_ENUM_PLUGINS@": _DEFAULT_LOAD_PLUGINS + """
+LLDB_PLUGIN(ProcessMacOSXKernel)
+LLDB_PLUGIN(SymbolLocatorDebugSymbols)
+LLDB_PLUGIN(SymbolVendorMacOSX)
+""",
+ },
+ "//conditions:default": {
+ "@LLDB_ENUM_PLUGINS@": _DEFAULT_LOAD_PLUGINS,
+ },
+ }),
+ template = "Plugins.def.in",
+)
+
+cc_library(
+ name = "PluginsConfig",
+ hdrs = [":plugins_config_gen"],
+ include_prefix = "Plugins",
+)
diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/plugin_config.bzl b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/plugin_config.bzl
new file mode 100644
index 000000000000..5949d2d7a504
--- /dev/null
+++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/plugin_config.bzl
@@ -0,0 +1,104 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""Common configuration for LLDB plugins."""
+
+load("//:vars.bzl", "CMAKE_CXX_STANDARD")
+
+DEFAULT_PLUGINS = [
+ "ABIAArch64",
+ "ABIARM",
+ "ABIHexagon",
+ "ABIMips",
+ "ABIMSP430",
+ "ABIPowerPC",
+ "ABIRISCV",
+ "ABISystemZ",
+ "ABIX86",
+ "AppleObjCRuntime",
+ "ArchitectureAArch64",
+ "ArchitectureArm",
+ "ArchitectureMips",
+ "ArchitecturePPC64",
+ "ClangREPL",
+ "CPlusPlusLanguage",
+ "CXXItaniumABI",
+ "DisassemblerLLVMC",
+ "DynamicLoaderDarwinKernel",
+ "DynamicLoaderHexagonDYLD",
+ "DynamicLoaderMacOSXDYLD",
+ "DynamicLoaderPosixDYLD",
+ "DynamicLoaderStatic",
+ "DynamicLoaderWasmDYLD",
+ "DynamicLoaderWindowsDYLD",
+ "GNUstepObjCRuntime",
+ "InstructionARM",
+ "InstructionARM64",
+ "InstructionLoongArch",
+ "InstructionMIPS",
+ "InstructionMIPS64",
+ "InstructionPPC64",
+ "InstructionRISCV",
+ "InstrumentationRuntimeASan",
+ "InstrumentationRuntimeASanLibsanitizers",
+ "InstrumentationRuntimeMainThreadChecker",
+ "InstrumentationRuntimeTSan",
+ "InstrumentationRuntimeUBSan",
+ "JITLoaderGDB",
+ "MemoryHistoryASan",
+ "ObjCLanguage",
+ "ObjCPlusPlusLanguage",
+ "ObjectContainerBSDArchive",
+ "ObjectContainerMachOArchive",
+ "ObjectContainerMachOFileset",
+ "ObjectFileBreakpad",
+ "ObjectFileCOFF",
+ "ObjectFileELF",
+ "ObjectFileJSON",
+ "ObjectFileMachO",
+ "ObjectFileMinidump",
+ "ObjectFilePDB",
+ "ObjectFilePECOFF",
+ "ObjectFilePlaceholder",
+ "ObjectFileWasm",
+ "PlatformAndroid",
+ "PlatformGDB",
+ "PlatformLinux",
+ "PlatformMacOSX",
+ "PlatformQemuUser",
+ "PlatformWindows",
+ "ProcessElfCore",
+ "ProcessMachCore",
+ "ProcessMinidump",
+ "RegisterTypeBuilderClang",
+ "ScriptedProcess",
+ "StructuredDataDarwinLog",
+ "SymbolFileBreakpad",
+ "SymbolFileCTF",
+ "SymbolFileDWARF",
+ "SymbolFileJSON",
+ "SymbolFilePDB",
+ "SymbolFileSymtab",
+ "SymbolLocatorDebuginfod",
+ "SymbolLocatorDefault",
+ "SymbolVendorELF",
+ "SymbolVendorPECOFF",
+ "SymbolVendorWasm",
+ "SystemRuntimeMacOSX",
+ "TraceExporterCTF",
+ "TypeSystemClang",
+ "UnwindAssemblyInstEmulation",
+ "UnwindAssemblyX86",
+]
+
+DEFAULT_SCRIPT_PLUGINS = [
+ "ScriptInterpreterNone",
+]
+
+OBJCPP_COPTS = [
+ "-std=c++{}".format(CMAKE_CXX_STANDARD),
+ "-fno-objc-exceptions",
+ "-fno-objc-arc",
+ "-Wno-shorten-64-to-32",
+]
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index ddd3e69e6ce3..497edcfceffe 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -4275,6 +4275,7 @@ cc_library(
":AffineDialect",
":Analysis",
":ArithDialect",
+ ":ArithUtils",
":DialectUtils",
":FuncDialect",
":IR",