From a853d79963c6ac09154817690a8c317157345876 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 3 Apr 2024 18:48:30 -0700 Subject: [RISCV][GISel] Don't check for FP uses of IMPLICIT_DEF if the type is vector. NFC If the type is vector, we can immediately know to use vector mapping. Previously we searched for FP uses, but then replaced it if the type was vector. --- llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp index 4f34514c2cfe..8534024b6002 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp @@ -341,18 +341,17 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_IMPLICIT_DEF: { Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); - uint64_t DstMinSize = DstTy.getSizeInBits().getKnownMinValue(); + unsigned DstMinSize = DstTy.getSizeInBits().getKnownMinValue(); auto Mapping = GPRValueMapping; // FIXME: May need to do a better job determining when to use FPRB. // For example, the look through COPY case: // %0:_(s32) = G_IMPLICIT_DEF // %1:_(s32) = COPY %0 // $f10_d = COPY %1(s32) - if (anyUseOnlyUseFP(Dst, MRI, TRI)) - Mapping = getFPValueMapping(DstMinSize); - if (DstTy.isVector()) Mapping = getVRBValueMapping(DstMinSize); + else if (anyUseOnlyUseFP(Dst, MRI, TRI)) + Mapping = getFPValueMapping(DstMinSize); return getInstructionMapping(DefaultMappingID, /*Cost=*/1, Mapping, NumOperands); -- cgit v1.2.3 From a4c470555b5c311770e6cb58494c573c4efe53d6 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Thu, 4 Apr 2024 11:17:59 +0900 Subject: [mlir][linalg] Fix builder API usage in `RegionBuilderHelper` (#87451) Operations must be created with the supplied builder. 
Otherwise, the dialect conversion / greedy pattern rewrite driver can break. This commit fixes a crash in the dialect conversion: ``` within split at llvm-project/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir:1 offset :8:8: error: failed to legalize operation 'tosa.add' %0 = tosa.add %1, %arg2 : (tensor<10x10xf32>, tensor<*xf32>) -> tensor<*xf32> ^ within split at llvm-project/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir:1 offset :8:8: note: see current operation: %9 = "tosa.add"(%8, %arg2) : (tensor<10x10xf32>, tensor<*xf32>) -> tensor<*xf32> mlir-opt: llvm-project/mlir/include/mlir/IR/UseDefLists.h:198: mlir::IRObjectWithUseList::~IRObjectWithUseList() [OperandType = mlir::OpOperand]: Assertion `use_empty() && "Cannot destroy a value that still has uses!"' failed. ``` This commit is the proper fix for #87297 (which was reverted). --- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 36 +++++++++++----------- .../SparseTensor/Transforms/SparseTensorPasses.cpp | 2 +- .../TosaToLinalg/tosa-to-linalg-invalid.mlir | 12 ++++++++ .../mlir-linalg-ods-yaml-gen.cpp | 2 +- 4 files changed, 32 insertions(+), 20 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 2d7219fef87c..9c5c58fa1fab 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -373,14 +373,15 @@ namespace { class RegionBuilderHelper { public: - RegionBuilderHelper(MLIRContext *context, Block &block) - : context(context), block(block) {} + RegionBuilderHelper(OpBuilder &builder, Block &block) + : builder(builder), block(block) {} // Build the unary functions defined by OpDSL. 
Value buildUnaryFn(UnaryFn unaryFn, Value arg) { if (!isFloatingPoint(arg)) llvm_unreachable("unsupported non numeric type"); - OpBuilder builder = getBuilder(); + OpBuilder::InsertionGuard g(builder); + builder.setInsertionPointToEnd(&block); switch (unaryFn) { case UnaryFn::exp: return builder.create(arg.getLoc(), arg); @@ -407,7 +408,8 @@ public: arg1.getType().getIntOrFloatBitWidth() == 1; if (!allComplex && !allFloatingPoint && !allInteger) llvm_unreachable("unsupported non numeric type"); - OpBuilder builder = getBuilder(); + OpBuilder::InsertionGuard g(builder); + builder.setInsertionPointToEnd(&block); switch (binaryFn) { case BinaryFn::add: if (allComplex) @@ -481,29 +483,32 @@ public: } void yieldOutputs(ValueRange values) { - OpBuilder builder = getBuilder(); + OpBuilder::InsertionGuard g(builder); + builder.setInsertionPointToEnd(&block); Location loc = builder.getUnknownLoc(); builder.create(loc, values); } Value constant(const std::string &value) { - OpBuilder builder = getBuilder(); + OpBuilder::InsertionGuard g(builder); + builder.setInsertionPointToEnd(&block); Location loc = builder.getUnknownLoc(); Attribute valueAttr = parseAttribute(value, builder.getContext()); return builder.create(loc, ::cast(valueAttr)); } Value index(int64_t dim) { - OpBuilder builder = getBuilder(); + OpBuilder::InsertionGuard g(builder); + builder.setInsertionPointToEnd(&block); return builder.create(builder.getUnknownLoc(), dim); } Type getIntegerType(unsigned width) { - return IntegerType::get(context, width); + return IntegerType::get(builder.getContext(), width); } - Type getFloat32Type() { return Float32Type::get(context); } - Type getFloat64Type() { return Float64Type::get(context); } + Type getFloat32Type() { return Float32Type::get(builder.getContext()); } + Type getFloat64Type() { return Float64Type::get(builder.getContext()); } private: // Generates operations to cast the given operand to a specified type. 
@@ -511,7 +516,8 @@ private: // operand returned as-is (which will presumably yield a verification // issue downstream). Value cast(Type toType, Value operand, bool isUnsignedCast) { - OpBuilder builder = getBuilder(); + OpBuilder::InsertionGuard g(builder); + builder.setInsertionPointToEnd(&block); auto loc = operand.getLoc(); return convertScalarToDtype(builder, loc, operand, toType, isUnsignedCast); } @@ -526,13 +532,7 @@ private: return llvm::isa(value.getType()); } - OpBuilder getBuilder() { - OpBuilder builder(context); - builder.setInsertionPointToEnd(&block); - return builder; - } - - MLIRContext *context; + OpBuilder &builder; Block █ }; diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp index d4c17928d4ca..acea25f02398 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp @@ -274,7 +274,7 @@ struct SparseTensorCodegenPass }); // The following operations and dialects may be introduced by the // codegen rules, and are therefore marked as legal. 
- target.addLegalOp(); + target.addLegalOp(); target.addLegalDialect< arith::ArithDialect, bufferization::BufferizationDialect, complex::ComplexDialect, memref::MemRefDialect, scf::SCFDialect>(); diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir index 17eec5936918..ad65410e635e 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir @@ -15,3 +15,15 @@ func.func @tensor_with_unknown_rank(%arg0: tensor<*xi8>) -> tensor<*xi8> { %0 = "tosa.abs"(%arg0) : (tensor<*xi8>) -> tensor<*xi8> return %0 : tensor<*xi8> } + +// ----- + +// CHECK-LABEL: @unranked_add +func.func @unranked_add(%arg0 : tensor<10x10xf32> , %arg1 : tensor<10x10xf32>, %arg2 : tensor<*xf32>) -> (tensor<10x10xf32>) { + // expected-error@+3 {{failed to legalize operation 'tosa.add'}} + %reduce = tosa.reduce_max %arg0 {axis = 1 : i32} : (tensor<10x10xf32>) -> tensor<10x1xf32> + %1 = tosa.add %reduce, %arg1 : (tensor<10x1xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> + %0 = tosa.add %1, %arg2 : (tensor<10x10xf32>, tensor<*xf32>) -> tensor<*xf32> + %2 = tosa.reshape %0 {new_shape = array} : (tensor<*xf32>) -> tensor<10x10xf32> + return %2 : tensor<10x10xf32> +} diff --git a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp index f14e559fff92..fe6ad1504112 100644 --- a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp +++ b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp @@ -1008,7 +1008,7 @@ void {0}::regionBuilder(ImplicitLocOpBuilder &b, Block &block, ArrayRef attrs) {{ assert({1} > 0 && block.getNumArguments() == {1} && "{0} regionBuilder expects {1} (>=0) args"); - RegionBuilderHelper helper(block.getArgument(0).getContext(), block); + RegionBuilderHelper helper(b, block); SmallVector yields; {2} {3} -- cgit v1.2.3 From 
fb635be0b8a3e14ca38f3f74e8224f9e0f716a10 Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Thu, 4 Apr 2024 06:03:00 +0300 Subject: [libc++] P2867R1: Remove Deprecated `strstream`s From C++26 (#87107) Implements: https://wg21.link/P2867R2 --------- Co-authored-by: Hristo Hristov --- libcxx/docs/ReleaseNotes/19.rst | 3 +++ libcxx/docs/Status/Cxx2cPapers.csv | 2 +- libcxx/docs/UsingLibcxx.rst | 5 ++++- libcxx/include/strstream | 12 ++++++++---- libcxx/modules/std/strstream.inc | 2 ++ .../depr.istrstream/depr.istrstream.cons/ccp.pass.cpp | 2 +- .../depr.istrstream/depr.istrstream.cons/ccp_size.pass.cpp | 2 +- .../depr.istrstream/depr.istrstream.cons/cp.pass.cpp | 2 +- .../depr.istrstream/depr.istrstream.cons/cp_size.pass.cpp | 2 +- .../depr.istrstream/depr.istrstream.members/rdbuf.pass.cpp | 2 +- .../depr.istrstream/depr.istrstream.members/str.pass.cpp | 2 +- .../depr/depr.str.strstreams/depr.istrstream/depr.verify.cpp | 2 ++ .../depr/depr.str.strstreams/depr.istrstream/types.pass.cpp | 2 +- .../depr.ostrstream.cons/cp_size_mode.pass.cpp | 2 +- .../depr.ostrstream/depr.ostrstream.cons/default.pass.cpp | 2 +- .../depr.ostrstream/depr.ostrstream.members/freeze.pass.cpp | 2 +- .../depr.ostrstream/depr.ostrstream.members/pcount.pass.cpp | 2 +- .../depr.ostrstream/depr.ostrstream.members/rdbuf.pass.cpp | 2 +- .../depr.ostrstream/depr.ostrstream.members/str.pass.cpp | 2 +- .../depr/depr.str.strstreams/depr.ostrstream/depr.verify.cpp | 2 ++ .../depr/depr.str.strstreams/depr.ostrstream/types.pass.cpp | 2 +- .../depr.strstream/depr.strstream.cons/cp_size_mode.pass.cpp | 2 +- .../depr.strstream/depr.strstream.cons/default.pass.cpp | 2 +- .../depr.strstream/depr.strstream.dest/rdbuf.pass.cpp | 2 +- .../depr.strstream/depr.strstream.oper/freeze.pass.cpp | 2 +- .../depr.strstream/depr.strstream.oper/pcount.pass.cpp | 2 +- .../depr.strstream/depr.strstream.oper/str.pass.cpp | 2 +- .../depr/depr.str.strstreams/depr.strstream/depr.verify.cpp | 2 ++ 
.../depr/depr.str.strstreams/depr.strstream/types.pass.cpp | 2 +- .../depr.strstreambuf.cons/ccp_size.pass.cpp | 2 +- .../depr.strstreambuf.cons/cp_size_cp.pass.cpp | 2 +- .../depr.strstreambuf.cons/cscp_size.pass.cpp | 2 +- .../depr.strstreambuf.cons/cucp_size.pass.cpp | 2 +- .../depr.strstreambuf.cons/custom_alloc.pass.cpp | 2 +- .../depr.strstreambuf.cons/default.pass.cpp | 2 +- .../depr.strstreambuf.cons/scp_size_scp.pass.cpp | 2 +- .../depr.strstreambuf.cons/ucp_size_ucp.pass.cpp | 2 +- .../depr.strstreambuf.members/freeze.pass.cpp | 2 +- .../depr.strstreambuf.members/overflow.pass.cpp | 2 +- .../depr.strstreambuf.members/pcount.pass.cpp | 2 +- .../depr.strstreambuf/depr.strstreambuf.members/str.pass.cpp | 2 +- .../depr.strstreambuf.virtuals/overflow.pass.cpp | 2 +- .../depr.strstreambuf.virtuals/pbackfail.pass.cpp | 2 +- .../depr.strstreambuf.virtuals/seekoff.pass.cpp | 2 +- .../depr.strstreambuf.virtuals/seekpos.pass.cpp | 2 +- .../depr.strstreambuf.virtuals/setbuf.pass.cpp | 2 +- .../depr.strstreambuf.virtuals/underflow.pass.cpp | 2 +- .../depr.str.strstreams/depr.strstreambuf/depr.verify.cpp | 2 ++ .../depr.str.strstreams/depr.strstreambuf/types.pass.cpp | 2 +- 49 files changed, 66 insertions(+), 46 deletions(-) diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index dd39c1bbbc78..2da9df54a531 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -42,6 +42,7 @@ Implemented Papers - P2652R2 - Disallow User Specialization of ``allocator_traits`` - P2819R2 - Add ``tuple`` protocol to ``complex`` - P2495R3 - Interfacing ``stringstream``\s with ``string_view`` +- P2867R2 - Remove Deprecated ``strstream``\s From C++26 - P2302R4 - ``std::ranges::contains`` - P1659R3 - ``std::ranges::starts_with`` and ``std::ranges::ends_with`` @@ -54,6 +55,8 @@ Improvements and New Features - The ``std::mismatch`` algorithm has been optimized for integral types, which can lead up to 40x performance improvements. 
+- The ``_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM`` macro has been added to make the declarations in ```` available. + Deprecations and Removals ------------------------- diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index 6e82086d5469..a34dad581680 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -47,7 +47,7 @@ "`P1673R13 `__","LWG","A free function linear algebra interface based on the BLAS","Kona November 2023","","","" "","","","","","","" "`P2875R4 `__","LWG","Undeprecate ``polymorphic_allocator::destroy`` for C++26","Tokyo March 2024","|Complete|","15.0","" -"`P2867R2 `__","LWG","Remove Deprecated ``strstreams`` From C++26","Tokyo March 2024","","","" +"`P2867R2 `__","LWG","Remove Deprecated ``strstreams`` From C++26","Tokyo March 2024","|Complete|","19.0","" "`P2869R4 `__","LWG","Remove Deprecated ``shared_ptr`` Atomic Access APIs from C++26","Tokyo March 2024","","","" "`P2872R3 `__","LWG","Remove ``wstring_convert`` From C++26","Tokyo March 2024","","","" "`P3107R5 `__","LWG","Permit an efficient implementation of ``std::print``","Tokyo March 2024","","","|format| |DR|" diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst index ac12b0b96950..8a1c747a2541 100644 --- a/libcxx/docs/UsingLibcxx.rst +++ b/libcxx/docs/UsingLibcxx.rst @@ -272,7 +272,10 @@ C++26 Specific Configuration Macros ``std::basic_string<...>::reserve()``. **_LIBCPP_ENABLE_CXX26_REMOVED_ALLOCATOR_MEMBERS**: - This macro is used to re-enable redundant member of ``allocator::is_always_equal`` + This macro is used to re-enable redundant member of ``allocator::is_always_equal``. + +**_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM**: + This macro is used to re-enable all named declarations in ````. 
Libc++ Extensions ================= diff --git a/libcxx/include/strstream b/libcxx/include/strstream index e9f533644f78..c8df6eb0da03 100644 --- a/libcxx/include/strstream +++ b/libcxx/include/strstream @@ -13,7 +13,7 @@ /* strstream synopsis -class strstreambuf +class strstreambuf // Removed in C++26 : public basic_streambuf { public: @@ -63,7 +63,7 @@ private: void (*pfree)(void*); // exposition only }; -class istrstream +class istrstream // Removed in C++26 : public basic_istream { public: @@ -81,7 +81,7 @@ private: strstreambuf sb; // exposition only }; -class ostrstream +class ostrstream // Removed in C++26 : public basic_ostream { public: @@ -99,7 +99,7 @@ private: strstreambuf sb; // exposition only }; -class strstream +class strstream // Removed in C++26 : public basic_iostream { public: @@ -138,6 +138,8 @@ private: # pragma GCC system_header #endif +#if _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM) || defined(_LIBCPP_BUILDING_LIBRARY) + _LIBCPP_PUSH_MACROS #include <__undef_macros> @@ -344,4 +346,6 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS +#endif // _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM) || defined(_LIBCPP_BUILDING_LIBRARY) + #endif // _LIBCPP_STRSTREAM diff --git a/libcxx/modules/std/strstream.inc b/libcxx/modules/std/strstream.inc index a33c514f1502..808796701aba 100644 --- a/libcxx/modules/std/strstream.inc +++ b/libcxx/modules/std/strstream.inc @@ -9,9 +9,11 @@ export namespace std { #ifndef _LIBCPP_HAS_NO_LOCALIZATION +# if _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM) using std::istrstream; using std::ostrstream; using std::strstream; using std::strstreambuf; +# endif // _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM) #endif // _LIBCPP_HAS_NO_LOCALIZATION } // namespace std diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp.pass.cpp 
b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp.pass.cpp index b5ee0bfbecf0..701e6dfde57a 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp_size.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp_size.pass.cpp index 4d0d6731338a..fd2ad66e108b 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp_size.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp_size.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp.pass.cpp index 58980949732d..c5fe349f24d7 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: 
-D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp_size.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp_size.pass.cpp index e13e20e20f8c..7d9c7d6c4200 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp_size.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp_size.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/rdbuf.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/rdbuf.pass.cpp index 449114a0f6a9..f5ef29bcdbe5 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/rdbuf.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/rdbuf.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/str.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/str.pass.cpp index e7c063750fb6..63d3800290e1 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/str.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/str.pass.cpp @@ -6,7 
+6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.verify.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.verify.cpp index 2ab252e93494..d02f12d9ebb1 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.verify.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.verify.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM + // // check that istrstream is marked deprecated diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/types.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/types.pass.cpp index be1a9e1251ba..526c4dc8b41d 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/types.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/types.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/cp_size_mode.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/cp_size_mode.pass.cpp index 8698983a7ebc..b9db1bf444ed 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/cp_size_mode.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/cp_size_mode.pass.cpp @@ -6,7 +6,7 @@ // 
//===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/default.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/default.pass.cpp index abbf6af7e11e..b67f0ab16e92 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/default.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/default.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/freeze.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/freeze.pass.cpp index 854e68b17249..e087c0686f09 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/freeze.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/freeze.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/pcount.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/pcount.pass.cpp index 9830aeb6fc8c..73f20335d03d 100644 --- 
a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/pcount.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/pcount.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/rdbuf.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/rdbuf.pass.cpp index f9a859dd8dab..d8b55d90315e 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/rdbuf.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/rdbuf.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/str.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/str.pass.cpp index 72f665af5851..28670315339e 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/str.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/str.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git 
a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.verify.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.verify.cpp index e0c805fee107..9ec46509faad 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.verify.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.verify.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM + // // check that ostrstream is marked deprecated diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/types.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/types.pass.cpp index 6a71c44a90cf..e321444924fd 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/types.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/types.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/cp_size_mode.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/cp_size_mode.pass.cpp index a85e1328b351..54e09d3f30db 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/cp_size_mode.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/cp_size_mode.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git 
a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/default.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/default.pass.cpp index 390162ef0f18..1e4c1209f711 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/default.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/default.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.dest/rdbuf.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.dest/rdbuf.pass.cpp index 3fe277a48cbb..99d58cc57351 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.dest/rdbuf.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.dest/rdbuf.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/freeze.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/freeze.pass.cpp index 263fddef6c34..6cc26bba7d02 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/freeze.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/freeze.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: 
-D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/pcount.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/pcount.pass.cpp index b053cf128ab4..efe2c324eb31 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/pcount.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/pcount.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/str.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/str.pass.cpp index 3d251d9a99cd..e1bc40c86eaf 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/str.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/str.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.verify.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.verify.cpp index 03655226a54b..ab88d6d49a11 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.verify.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.verify.cpp @@ -6,6 +6,8 @@ // 
//===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM + // // check that strstream is marked deprecated diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/types.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/types.pass.cpp index fb543841e8f3..76094302a0cd 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/types.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/types.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ccp_size.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ccp_size.pass.cpp index 8f81707973d8..3d7f2ffcfead 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ccp_size.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ccp_size.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cp_size_cp.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cp_size_cp.pass.cpp index 25a9617dcdaf..0c4bf6214f7f 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cp_size_cp.pass.cpp +++ 
b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cp_size_cp.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cscp_size.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cscp_size.pass.cpp index fc3386ff5575..e09288354472 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cscp_size.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cscp_size.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cucp_size.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cucp_size.pass.cpp index a74c504fd278..7f5504dac1b1 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cucp_size.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cucp_size.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git 
a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/custom_alloc.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/custom_alloc.pass.cpp index 756427df4820..0aa7e1afcb17 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/custom_alloc.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/custom_alloc.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp index 81924c995b15..803be42fe756 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/scp_size_scp.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/scp_size_scp.pass.cpp index b8991a8fc433..35c1512e0a1d 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/scp_size_scp.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/scp_size_scp.pass.cpp @@ -6,7 +6,7 @@ 
// //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ucp_size_ucp.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ucp_size_ucp.pass.cpp index 1d3463f3cac1..e71fa031ebcb 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ucp_size_ucp.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ucp_size_ucp.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/freeze.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/freeze.pass.cpp index 93eec8dde397..1276187d7f3e 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/freeze.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/freeze.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/overflow.pass.cpp 
b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/overflow.pass.cpp index 5b973cff0ce0..fc79e787d38f 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/overflow.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/overflow.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/pcount.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/pcount.pass.cpp index b64c9dcb4447..b62c33924188 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/pcount.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/pcount.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/str.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/str.pass.cpp index d6c8b8e7e11b..68be8ddec035 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/str.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/str.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: 
-D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/overflow.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/overflow.pass.cpp index 37109c7e942c..6a932dc00afa 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/overflow.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/overflow.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/pbackfail.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/pbackfail.pass.cpp index 698953f7121a..484a726073aa 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/pbackfail.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/pbackfail.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekoff.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekoff.pass.cpp index d98e6f73f7aa..96900d89e402 100644 --- 
a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekoff.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekoff.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekpos.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekpos.pass.cpp index be88f5aecc5c..f3193d352049 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekpos.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekpos.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/setbuf.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/setbuf.pass.cpp index ce7612bc66a9..44e6704dccb4 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/setbuf.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/setbuf.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS 
-D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/underflow.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/underflow.pass.cpp index 4fc79b575c7a..be916be479e2 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/underflow.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/underflow.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.verify.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.verify.cpp index a598acbba8c8..471b287e7f8a 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.verify.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.verify.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM + // // check that strstreambuf is marked deprecated diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/types.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/types.pass.cpp index bc312cbbb225..aee126056c0e 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/types.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/types.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS +// ADDITIONAL_COMPILE_FLAGS: 
-D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM // -- cgit v1.2.3 From 07d5f491867d390b9aee33035c187e27cf0746a0 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 4 Apr 2024 11:30:23 +0800 Subject: [RISCV] Add patterns for fixed vector vwsll (#87316) Fixed vectors have their sext/zext operands legalized to _VL nodes, so we need to handle them in the patterns. This adds a riscv_ext_vl_oneuse pattern since we don't care about the type of extension used for the shift amount, and extends Low8BitsSplatPat to handle other _VL nodes. We don't actually need to check the mask or VL there since none of the _VL nodes have passthru operands. The remaining test cases that are widening from i8->i64 need to be handled by extending combineBinOp_VLToVWBinOp_VL. This also fixes Low8BitsSplatPat incorrectly checking the vector size instead of the element size to determine if the splat value might have been truncated below 8 bits. --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 30 ++-- llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td | 8 + llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td | 35 ++++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll | 183 ++++++++------------- 4 files changed, 130 insertions(+), 126 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 55ba4949b3ea..f99dc0b85763 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3287,24 +3287,24 @@ bool RISCVDAGToDAGISel::selectVSplatUimm(SDValue N, unsigned Bits, } bool RISCVDAGToDAGISel::selectLow8BitsVSplat(SDValue N, SDValue &SplatVal) { - // Truncates are custom lowered during legalization. 
- auto IsTrunc = [this](SDValue N) { - if (N->getOpcode() != RISCVISD::TRUNCATE_VECTOR_VL) + auto IsExtOrTrunc = [](SDValue N) { + switch (N->getOpcode()) { + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + // There's no passthru on these _VL nodes so any VL/mask is ok, since any + // inactive elements will be undef. + case RISCVISD::TRUNCATE_VECTOR_VL: + case RISCVISD::VSEXT_VL: + case RISCVISD::VZEXT_VL: + return true; + default: return false; - SDValue VL; - selectVLOp(N->getOperand(2), VL); - // Any vmset_vl is ok, since any bits past VL are undefined and we can - // assume they are set. - return N->getOperand(1).getOpcode() == RISCVISD::VMSET_VL && - isa(VL) && - cast(VL)->getSExtValue() == RISCV::VLMaxSentinel; + } }; - // We can have multiple nested truncates, so unravel them all if needed. - while (N->getOpcode() == ISD::SIGN_EXTEND || - N->getOpcode() == ISD::ZERO_EXTEND || IsTrunc(N)) { - if (!N.hasOneUse() || - N.getValueType().getSizeInBits().getKnownMinValue() < 8) + // We can have multiple nested nodes, so unravel them all if needed. 
+ while (IsExtOrTrunc(N)) { + if (!N.hasOneUse() || N.getScalarValueSizeInBits() < 8) return false; N = N->getOperand(0); } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index cc44092700c6..73d52d5ecafb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -387,6 +387,9 @@ def SDT_RISCVVEXTEND_VL : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVT<3, XLenVT>]>; def riscv_sext_vl : SDNode<"RISCVISD::VSEXT_VL", SDT_RISCVVEXTEND_VL>; def riscv_zext_vl : SDNode<"RISCVISD::VZEXT_VL", SDT_RISCVVEXTEND_VL>; +def riscv_ext_vl : PatFrags<(ops node:$A, node:$B, node:$C), + [(riscv_sext_vl node:$A, node:$B, node:$C), + (riscv_zext_vl node:$A, node:$B, node:$C)]>; def riscv_trunc_vector_vl : SDNode<"RISCVISD::TRUNCATE_VECTOR_VL", SDTypeProfile<1, 3, [SDTCisVec<0>, @@ -535,6 +538,11 @@ def riscv_zext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), return N->hasOneUse(); }]>; +def riscv_ext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), + (riscv_ext_vl node:$A, node:$B, node:$C), [{ + return N->hasOneUse(); +}]>; + def riscv_fpextend_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), (riscv_fpextend_vl node:$A, node:$B, node:$C), [{ return N->hasOneUse(); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td index 51a7a0a15d97..c1facc790fc0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td @@ -629,6 +629,19 @@ foreach vtiToWti = AllWidenableIntVectors in { wti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(riscv_shl_vl + (wti.Vector (riscv_zext_vl_oneuse + (vti.Vector vti.RegClass:$rs2), + (vti.Mask V0), VLOpFrag)), + (wti.Vector (riscv_ext_vl_oneuse + (vti.Vector vti.RegClass:$rs1), + (vti.Mask V0), VLOpFrag)), + (wti.Vector wti.RegClass:$merge), + (vti.Mask 
V0), VLOpFrag), + (!cast("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK") + wti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1, + (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(riscv_shl_vl (wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))), (wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))), @@ -638,6 +651,17 @@ foreach vtiToWti = AllWidenableIntVectors in { wti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(riscv_shl_vl + (wti.Vector (riscv_zext_vl_oneuse + (vti.Vector vti.RegClass:$rs2), + (vti.Mask V0), VLOpFrag)), + (wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))), + (wti.Vector wti.RegClass:$merge), + (vti.Mask V0), VLOpFrag), + (!cast("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK") + wti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1, + (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(riscv_shl_vl (wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))), (wti.Vector (SplatPat_uimm5 uimm5:$rs1)), @@ -647,6 +671,17 @@ foreach vtiToWti = AllWidenableIntVectors in { wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(riscv_shl_vl + (wti.Vector (riscv_zext_vl_oneuse + (vti.Vector vti.RegClass:$rs2), + (vti.Mask V0), VLOpFrag)), + (wti.Vector (SplatPat_uimm5 uimm5:$rs1)), + (wti.Vector wti.RegClass:$merge), + (vti.Mask V0), VLOpFrag), + (!cast("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK") + wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1, + (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(riscv_vwsll_vl (vti.Vector vti.RegClass:$rs2), (vti.Vector vti.RegClass:$rs1), diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll index f5305a1c36de..83d1d1b3f94c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll @@ -19,10 +19,9 @@ define <4 x i64> 
@vwsll_vv_v4i64_sext(<4 x i32> %a, <4 x i32> %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vv_v4i64_sext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %x = zext <4 x i32> %a to <4 x i64> %y = sext <4 x i32> %b to <4 x i64> @@ -41,10 +40,9 @@ define <4 x i64> @vwsll_vv_v4i64_zext(<4 x i32> %a, <4 x i32> %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vv_v4i64_zext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %x = zext <4 x i32> %a to <4 x i64> %y = zext <4 x i32> %b to <4 x i64> @@ -62,9 +60,9 @@ define <4 x i64> @vwsll_vx_i64_v4i64(<4 x i32> %a, i64 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i64_v4i64: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <4 x i64> poison, i64 %b, i32 0 %splat = shufflevector <4 x i64> %head, <4 x i64> poison, <4 x i32> zeroinitializer @@ -88,10 +86,8 @@ define <4 x i64> @vwsll_vx_i32_v4i64_sext(<4 x i32> %a, i32 %b) { ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 -; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <4 x i32> poison, i32 %b, i32 0 %splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer @@ -116,10 +112,8 @@ define <4 x i64> @vwsll_vx_i32_v4i64_zext(<4 x i32> %a, i32 %b) { ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 -; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <4 x i32> poison, i32 %b, i32 0 %splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer @@ -142,12 +136,9 @@ define <4 x i64> @vwsll_vx_i16_v4i64_sext(<4 x i32> %a, i16 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i16_v4i64_sext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 -; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vsext.vf4 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <4 x i16> poison, i16 %b, i32 0 %splat = shufflevector <4 x i16> %head, <4 x i16> poison, <4 x i32> zeroinitializer @@ -170,12 +161,9 @@ define <4 x i64> @vwsll_vx_i16_v4i64_zext(<4 x i32> %a, i16 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i16_v4i64_zext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 -; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vzext.vf4 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: 
vsetivli zero, 4, e32, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <4 x i16> poison, i16 %b, i32 0 %splat = shufflevector <4 x i16> %head, <4 x i16> poison, <4 x i32> zeroinitializer @@ -198,12 +186,9 @@ define <4 x i64> @vwsll_vx_i8_v4i64_sext(<4 x i32> %a, i8 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v4i64_sext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 -; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vsext.vf8 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <4 x i8> poison, i8 %b, i32 0 %splat = shufflevector <4 x i8> %head, <4 x i8> poison, <4 x i32> zeroinitializer @@ -226,12 +211,9 @@ define <4 x i64> @vwsll_vx_i8_v4i64_zext(<4 x i32> %a, i8 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v4i64_zext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 -; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vzext.vf8 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <4 x i8> poison, i8 %b, i32 0 %splat = shufflevector <4 x i8> %head, <4 x i8> poison, <4 x i32> zeroinitializer @@ -251,9 +233,9 @@ define <4 x i64> @vwsll_vi_v4i64(<4 x i32> %a) { ; ; CHECK-ZVBB-LABEL: vwsll_vi_v4i64: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; 
CHECK-ZVBB-NEXT: vwsll.vi v10, v8, 2 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %x = zext <4 x i32> %a to <4 x i64> %z = shl <4 x i64> %x, splat (i64 2) @@ -275,10 +257,9 @@ define <8 x i32> @vwsll_vv_v8i32_sext(<8 x i16> %a, <8 x i16> %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vv_v8i32_sext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %x = zext <8 x i16> %a to <8 x i32> %y = sext <8 x i16> %b to <8 x i32> @@ -297,10 +278,9 @@ define <8 x i32> @vwsll_vv_v8i32_zext(<8 x i16> %a, <8 x i16> %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vv_v8i32_zext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %x = zext <8 x i16> %a to <8 x i32> %y = zext <8 x i16> %b to <8 x i32> @@ -318,9 +298,9 @@ define <8 x i32> @vwsll_vx_i64_v8i32(<8 x i16> %a, i64 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i64_v8i32: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <8 x i64> poison, i64 %b, i32 0 %splat = shufflevector <8 x i64> %head, <8 x i64> poison, <8 x i32> zeroinitializer @@ -340,9 +320,9 @@ define <8 x i32> @vwsll_vx_i32_v8i32(<8 x i16> %a, i32 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i32_v8i32: ; CHECK-ZVBB: # %bb.0: -; 
CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <8 x i32> poison, i32 %b, i32 0 %splat = shufflevector <8 x i32> %head, <8 x i32> poison, <8 x i32> zeroinitializer @@ -366,10 +346,8 @@ define <8 x i32> @vwsll_vx_i16_v8i32_sext(<8 x i16> %a, i16 %b) { ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 -; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <8 x i16> poison, i16 %b, i32 0 %splat = shufflevector <8 x i16> %head, <8 x i16> poison, <8 x i32> zeroinitializer @@ -394,10 +372,8 @@ define <8 x i32> @vwsll_vx_i16_v8i32_zext(<8 x i16> %a, i16 %b) { ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 -; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <8 x i16> poison, i16 %b, i32 0 %splat = shufflevector <8 x i16> %head, <8 x i16> poison, <8 x i32> zeroinitializer @@ -420,12 +396,9 @@ define <8 x i32> @vwsll_vx_i8_v8i32_sext(<8 x i16> %a, i8 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v8i32_sext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 -; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: 
vsext.vf4 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <8 x i8> poison, i8 %b, i32 0 %splat = shufflevector <8 x i8> %head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -448,12 +421,9 @@ define <8 x i32> @vwsll_vx_i8_v8i32_zext(<8 x i16> %a, i8 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v8i32_zext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 -; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vzext.vf4 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <8 x i8> poison, i8 %b, i32 0 %splat = shufflevector <8 x i8> %head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -473,9 +443,9 @@ define <8 x i32> @vwsll_vi_v8i32(<8 x i16> %a) { ; ; CHECK-ZVBB-LABEL: vwsll_vi_v8i32: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2 +; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vi v10, v8, 2 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %x = zext <8 x i16> %a to <8 x i32> %z = shl <8 x i32> %x, splat (i32 2) @@ -497,10 +467,9 @@ define <16 x i16> @vwsll_vv_v16i16_sext(<16 x i8> %a, <16 x i8> %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vv_v16i16_sext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 
; CHECK-ZVBB-NEXT: ret %x = zext <16 x i8> %a to <16 x i16> %y = sext <16 x i8> %b to <16 x i16> @@ -519,10 +488,9 @@ define <16 x i16> @vwsll_vv_v16i16_zext(<16 x i8> %a, <16 x i8> %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vv_v16i16_zext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %x = zext <16 x i8> %a to <16 x i16> %y = zext <16 x i8> %b to <16 x i16> @@ -552,12 +520,9 @@ define <16 x i16> @vwsll_vx_i32_v16i16(<16 x i8> %a, i32 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i32_v16i16: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 -; CHECK-ZVBB-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vnsrl.wi v8, v12, 0 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 +; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <16 x i32> poison, i32 %b, i32 0 %splat = shufflevector <16 x i32> %head, <16 x i32> poison, <16 x i32> zeroinitializer @@ -577,9 +542,9 @@ define <16 x i16> @vwsll_vx_i16_v16i16(<16 x i8> %a, i16 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i16_v16i16: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <16 x i16> poison, i16 %b, i32 0 %splat = shufflevector <16 x i16> %head, <16 x i16> poison, <16 x i32> zeroinitializer @@ -603,10 +568,8 @@ define <16 x i16> 
@vwsll_vx_i8_v16i16_sext(<16 x i8> %a, i8 %b) { ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 -; CHECK-ZVBB-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <16 x i8> poison, i8 %b, i32 0 %splat = shufflevector <16 x i8> %head, <16 x i8> poison, <16 x i32> zeroinitializer @@ -631,10 +594,8 @@ define <16 x i16> @vwsll_vx_i8_v16i16_zext(<16 x i8> %a, i8 %b) { ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 -; CHECK-ZVBB-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9 -; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %head = insertelement <16 x i8> poison, i8 %b, i32 0 %splat = shufflevector <16 x i8> %head, <16 x i8> poison, <16 x i32> zeroinitializer @@ -654,9 +615,9 @@ define <16 x i16> @vwsll_vi_v16i16(<16 x i8> %a) { ; ; CHECK-ZVBB-LABEL: vwsll_vi_v16i16: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 -; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2 +; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-ZVBB-NEXT: vwsll.vi v10, v8, 2 +; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %x = zext <16 x i8> %a to <16 x i16> %z = shl <16 x i16> %x, splat (i16 2) -- cgit v1.2.3 From 698bf3dafcc0dfa15540ae7f1f9b72208a578bd2 Mon Sep 17 00:00:00 2001 From: Sourabh Singh Tomar Date: Thu, 4 Apr 2024 09:27:57 +0530 Subject: [flang][OpenMP] Fix for #86393 (#87452) --- flang/lib/Lower/OpenMP/ReductionProcessor.cpp | 6 +- .../test/Lower/OpenMP/parallel-reduction-array.f90 | 2 +- 
.../Lower/OpenMP/parallel-reduction-array2.f90 | 2 +- flang/test/Lower/OpenMP/parallel-reduction3.f90 | 125 +++++++++++++++++++++ flang/test/Lower/OpenMP/wsloop-reduction-array.f90 | 2 +- .../test/Lower/OpenMP/wsloop-reduction-array2.f90 | 2 +- 6 files changed, 134 insertions(+), 5 deletions(-) create mode 100644 flang/test/Lower/OpenMP/parallel-reduction3.f90 diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp index 0d05ca5aee65..6a8447a00e4c 100644 --- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp @@ -13,6 +13,7 @@ #include "ReductionProcessor.h" #include "flang/Lower/AbstractConverter.h" +#include "flang/Lower/SymbolMap.h" #include "flang/Optimizer/Builder/HLFIRTools.h" #include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/FIRType.h" @@ -527,7 +528,10 @@ void ReductionProcessor::addDeclareReduction( // all arrays must be boxed so that we have convenient access to all the // information needed to iterate over the array if (mlir::isa(redType.getEleTy())) { - hlfir::Entity entity{symVal}; + // For Host associated symbols, use `SymbolBox` instead + Fortran::lower::SymbolBox symBox = + converter.lookupOneLevelUpSymbol(*symbol); + hlfir::Entity entity{symBox.getAddr()}; entity = genVariableBox(currentLocation, builder, entity); mlir::Value box = entity.getBase(); diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array.f90 index 735a99854308..56dcabbb75c3 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array.f90 @@ -50,7 +50,7 @@ end program ! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "_QFEi"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) -! 
CHECK: %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#1(%[[VAL_2]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +! CHECK: %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0(%[[VAL_2]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> ! CHECK: %[[VAL_5:.*]] = fir.alloca !fir.box> ! CHECK: fir.store %[[VAL_4]] to %[[VAL_5]] : !fir.ref>> ! CHECK: omp.parallel byref reduction(@add_reduction_byref_box_3xi32 %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref>>) { diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 index 4834047a98a4..94bff410a2f0 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 @@ -50,7 +50,7 @@ end program ! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "_QFEi"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) -! CHECK: %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#1(%[[VAL_2]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +! CHECK: %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0(%[[VAL_2]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> ! CHECK: %[[VAL_5:.*]] = fir.alloca !fir.box> ! CHECK: fir.store %[[VAL_4]] to %[[VAL_5]] : !fir.ref>> ! CHECK: omp.parallel byref reduction(@add_reduction_byref_box_3xi32 %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref>>) { diff --git a/flang/test/Lower/OpenMP/parallel-reduction3.f90 b/flang/test/Lower/OpenMP/parallel-reduction3.f90 new file mode 100644 index 000000000000..b25759713e31 --- /dev/null +++ b/flang/test/Lower/OpenMP/parallel-reduction3.f90 @@ -0,0 +1,125 @@ +! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py + +! The script is designed to make adding checks to +! a test case fast, it is *not* designed to be authoritative +! about what constitutes a good test! The CHECK should be +! minimized and named to reflect the test intent. + +! 
RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s + + + +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref>> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>): +! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 +! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> +! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.array, %[[VAL_4]]#1 {bindc_name = ".tmp"} +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]](%[[VAL_5]]) {uniq_name = ".tmp"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) +! CHECK: hlfir.assign %[[VAL_1]] to %[[VAL_7]]#0 : i32, !fir.box> +! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> +! CHECK: fir.store %[[VAL_7]]#0 to %[[VAL_8]] : !fir.ref>> +! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) + +! CHECK-LABEL: } combiner { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[VAL_1:.*]]: !fir.ref>>): +! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> +! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref>> +! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_4]] : (!fir.box>, index) -> (index, index, index) +! CHECK: %[[VAL_6:.*]] = fir.shape_shift %[[VAL_5]]#0, %[[VAL_5]]#1 : (index, index) -> !fir.shapeshift<1> +! CHECK: %[[VAL_7:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_8:.*]] = %[[VAL_7]] to %[[VAL_5]]#1 step %[[VAL_7]] unordered { +! CHECK: %[[VAL_9:.*]] = fir.array_coor %[[VAL_2]](%[[VAL_6]]) %[[VAL_8]] : (!fir.box>, !fir.shapeshift<1>, index) -> !fir.ref +! CHECK: %[[VAL_10:.*]] = fir.array_coor %[[VAL_3]](%[[VAL_6]]) %[[VAL_8]] : (!fir.box>, !fir.shapeshift<1>, index) -> !fir.ref +! 
CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref +! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_10]] : !fir.ref +! CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32 +! CHECK: fir.store %[[VAL_13]] to %[[VAL_9]] : !fir.ref +! CHECK: } +! CHECK: omp.yield(%[[VAL_0]] : !fir.ref>>) +! CHECK: } + +! CHECK-LABEL: func.func @_QPs( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref {fir.bindc_name = "x"}) { +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsEi"} +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref +! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i32) -> i64 +! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i64) -> index +! CHECK: %[[VAL_7:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_8:.*]] = arith.cmpi sgt, %[[VAL_6]], %[[VAL_7]] : index +! CHECK: %[[VAL_9:.*]] = arith.select %[[VAL_8]], %[[VAL_6]], %[[VAL_7]] : index +! CHECK: %[[VAL_10:.*]] = fir.alloca !fir.array, %[[VAL_9]] {bindc_name = "c", uniq_name = "_QFsEc"} +! CHECK: %[[VAL_11:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]](%[[VAL_11]]) {uniq_name = "_QFsEc"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) +! CHECK: %[[VAL_13:.*]] = arith.constant 0 : i32 +! CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_12]]#0 : i32, !fir.box> +! CHECK: omp.parallel { +! CHECK: %[[VAL_14:.*]] = fir.alloca i32 {adapt.valuebyref, pinned} +! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFsEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_16:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_17:.*]] = arith.constant 100 : i32 +! CHECK: %[[VAL_18:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_19:.*]] = fir.alloca !fir.box> +! 
CHECK: fir.store %[[VAL_12]]#0 to %[[VAL_19]] : !fir.ref>> +! CHECK: omp.wsloop byref reduction(@add_reduction_byref_box_Uxi32 %[[VAL_19]] -> %[[VAL_20:.*]] : !fir.ref>>) for (%[[VAL_21:.*]]) : i32 = (%[[VAL_16]]) to (%[[VAL_17]]) inclusive step (%[[VAL_18]]) { +! CHECK: fir.store %[[VAL_21]] to %[[VAL_15]]#1 : !fir.ref +! CHECK: %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_20]] {uniq_name = "_QFsEc"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +! CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_22]]#0 : !fir.ref>> +! CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref +! CHECK: %[[VAL_25:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_26:.*]]:3 = fir.box_dims %[[VAL_23]], %[[VAL_25]] : (!fir.box>, index) -> (index, index, index) +! CHECK: %[[VAL_27:.*]] = fir.shape %[[VAL_26]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[VAL_28:.*]] = hlfir.elemental %[[VAL_27]] unordered : (!fir.shape<1>) -> !hlfir.expr { +! CHECK: ^bb0(%[[VAL_29:.*]]: index): +! CHECK: %[[VAL_30:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_31:.*]]:3 = fir.box_dims %[[VAL_23]], %[[VAL_30]] : (!fir.box>, index) -> (index, index, index) +! CHECK: %[[VAL_32:.*]] = arith.constant 1 : index +! CHECK: %[[VAL_33:.*]] = arith.subi %[[VAL_31]]#0, %[[VAL_32]] : index +! CHECK: %[[VAL_34:.*]] = arith.addi %[[VAL_29]], %[[VAL_33]] : index +! CHECK: %[[VAL_35:.*]] = hlfir.designate %[[VAL_23]] (%[[VAL_34]]) : (!fir.box>, index) -> !fir.ref +! CHECK: %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref +! CHECK: %[[VAL_37:.*]] = arith.addi %[[VAL_36]], %[[VAL_24]] : i32 +! CHECK: hlfir.yield_element %[[VAL_37]] : i32 +! CHECK: } +! CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_22]]#0 : !fir.ref>> +! CHECK: hlfir.assign %[[VAL_28]] to %[[VAL_38]] : !hlfir.expr, !fir.box> +! CHECK: hlfir.destroy %[[VAL_28]] : !hlfir.expr +! CHECK: omp.yield +! CHECK: } +! CHECK: omp.terminator +! CHECK: } +! CHECK: %[[VAL_39:.*]] = arith.constant 1 : index +! 
CHECK: %[[VAL_40:.*]] = hlfir.designate %[[VAL_12]]#0 (%[[VAL_39]]) : (!fir.box>, index) -> !fir.ref +! CHECK: %[[VAL_41:.*]] = fir.load %[[VAL_40]] : !fir.ref +! CHECK: %[[VAL_42:.*]] = arith.constant 5050 : i32 +! CHECK: %[[VAL_43:.*]] = arith.cmpi ne, %[[VAL_41]], %[[VAL_42]] : i32 +! CHECK: cf.cond_br %[[VAL_43]], ^bb1, ^bb2 +! CHECK: ^bb1: +! CHECK: %[[VAL_44:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_45:.*]] = arith.constant false +! CHECK: %[[VAL_46:.*]] = arith.constant false +! CHECK: %[[VAL_47:.*]] = fir.call @_FortranAStopStatement(%[[VAL_44]], %[[VAL_45]], %[[VAL_46]]) fastmath : (i32, i1, i1) -> none +! CHECK: fir.unreachable +! CHECK: ^bb2: +! CHECK: return +! CHECK: } +! CHECK: func.func private @_FortranAStopStatement(i32, i1, i1) -> none attributes {fir.runtime} + +subroutine s(x) + integer :: x + integer :: c(x) + c = 0 + !$omp parallel do reduction(+:c) + do i = 1, 100 + c = c + i + end do + !$omp end parallel do + + if (c(1) /= 5050) stop 1 +end subroutine s \ No newline at end of file diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 index a20ed1ca83ce..a898204c881d 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 @@ -60,7 +60,7 @@ end program ! CHECK: %[[VAL_8:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32 -! CHECK: %[[VAL_11:.*]] = fir.embox %[[VAL_5]]#1(%[[VAL_4]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +! CHECK: %[[VAL_11:.*]] = fir.embox %[[VAL_5]]#0(%[[VAL_4]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> ! CHECK: %[[VAL_12:.*]] = fir.alloca !fir.box> ! CHECK: fir.store %[[VAL_11]] to %[[VAL_12]] : !fir.ref>> ! 
CHECK: omp.wsloop byref reduction(@add_reduction_byref_box_2xi32 %[[VAL_12]] -> %[[VAL_13:.*]] : !fir.ref>>) for (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 index 61599876da8e..f3745c846091 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 @@ -60,7 +60,7 @@ end program ! CHECK: %[[VAL_8:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_9:.*]] = arith.constant 10 : i32 ! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32 -! CHECK: %[[VAL_11:.*]] = fir.embox %[[VAL_5]]#1(%[[VAL_4]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +! CHECK: %[[VAL_11:.*]] = fir.embox %[[VAL_5]]#0(%[[VAL_4]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> ! CHECK: %[[VAL_12:.*]] = fir.alloca !fir.box> ! CHECK: fir.store %[[VAL_11]] to %[[VAL_12]] : !fir.ref>> ! CHECK: omp.wsloop byref reduction(@add_reduction_byref_box_2xi32 %[[VAL_12]] -> %[[VAL_13:.*]] : !fir.ref>>) for (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) { -- cgit v1.2.3 From e8aaa3eaedc3b5519747a68e772f0bc664b89154 Mon Sep 17 00:00:00 2001 From: Gulfem Savrun Yeniceri Date: Thu, 4 Apr 2024 04:12:36 +0000 Subject: Revert "[libc] Added transitive bindings for OffsetType (#87397)" This reverts commit 3ee93f486293420852fb9ec95af9c5f54cecdb08 because it broke Fuchsia Clang toolchain builders: https://logs.chromium.org/logs/fuchsia/buildbucket/cr-buildbucket/8751633430491432833/+/u/clang/build/stdout --- libc/config/baremetal/api.td | 5 +---- libc/config/gpu/api.td | 6 +----- libc/config/linux/api.td | 12 ++---------- libc/include/CMakeLists.txt | 10 ++++------ libc/spec/posix.td | 7 ++----- libc/src/stdio/fseeko.h | 1 + libc/src/stdio/ftello.h | 1 + 7 files changed, 12 insertions(+), 30 deletions(-) diff --git a/libc/config/baremetal/api.td b/libc/config/baremetal/api.td index 
690edbda1311..25aa06aacb64 100644 --- a/libc/config/baremetal/api.td +++ b/libc/config/baremetal/api.td @@ -57,10 +57,7 @@ def MathAPI : PublicAPI<"math.h"> { } def StdIOAPI : PublicAPI<"stdio.h"> { - let Types = [ - "size_t", - "off_t", - ]; + let Types = ["size_t"]; } def StdlibAPI : PublicAPI<"stdlib.h"> { diff --git a/libc/config/gpu/api.td b/libc/config/gpu/api.td index 523ad49ffa3f..adaf5bfd747a 100644 --- a/libc/config/gpu/api.td +++ b/libc/config/gpu/api.td @@ -64,11 +64,7 @@ def StdIOAPI : PublicAPI<"stdio.h"> { SimpleMacroDef<"_IOLBF", "1">, SimpleMacroDef<"_IONBF", "2">, ]; - let Types = [ - "FILE", - "off_t", - "size_t", - ]; + let Types = ["size_t", "FILE"]; } def IntTypesAPI : PublicAPI<"inttypes.h"> { diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td index 9964971f191b..eb5ed8089850 100644 --- a/libc/config/linux/api.td +++ b/libc/config/linux/api.td @@ -49,10 +49,7 @@ def CTypeAPI : PublicAPI<"ctype.h"> { } def FCntlAPI : PublicAPI<"fcntl.h"> { - let Types = [ - "mode_t", - "off_t", - ]; + let Types = ["mode_t"]; } def IntTypesAPI : PublicAPI<"inttypes.h"> { @@ -80,12 +77,7 @@ def StdIOAPI : PublicAPI<"stdio.h"> { SimpleMacroDef<"_IOLBF", "1">, SimpleMacroDef<"_IONBF", "2">, ]; - let Types = [ - "FILE", - "cookie_io_functions_t", - "off_t", - "size_t", - ]; + let Types = ["size_t", "FILE", "cookie_io_functions_t"]; } def StdlibAPI : PublicAPI<"stdlib.h"> { diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 02c7dc8fbc0b..4203f0bc901b 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -41,10 +41,9 @@ add_gen_header( DEF_FILE fcntl.h.def GEN_HDR fcntl.h DEPENDS + .llvm_libc_common_h .llvm-libc-macros.fcntl_macros .llvm-libc-types.mode_t - .llvm-libc-types.off_t - .llvm_libc_common_h ) add_gen_header( @@ -265,14 +264,13 @@ add_gen_header( DEF_FILE stdio.h.def GEN_HDR stdio.h DEPENDS + .llvm_libc_common_h .llvm-libc-macros.file_seek_macros .llvm-libc-macros.stdio_macros - 
.llvm-libc-types.FILE - .llvm-libc-types.cookie_io_functions_t - .llvm-libc-types.off_t .llvm-libc-types.size_t .llvm-libc-types.ssize_t - .llvm_libc_common_h + .llvm-libc-types.FILE + .llvm-libc-types.cookie_io_functions_t ) add_gen_header( diff --git a/libc/spec/posix.td b/libc/spec/posix.td index 45f7ecfe84e9..cfa8d3afedde 100644 --- a/libc/spec/posix.td +++ b/libc/spec/posix.td @@ -210,10 +210,7 @@ def POSIX : StandardSpec<"POSIX"> { HeaderSpec FCntl = HeaderSpec< "fcntl.h", [], // Macros - [ - ModeTType, - OffTType, - ], + [ModeTType], [], // Enumerations [ FunctionSpec< @@ -1183,7 +1180,7 @@ def POSIX : StandardSpec<"POSIX"> { HeaderSpec StdIO = HeaderSpec< "stdio.h", [], // Macros - [OffTType], // Types + [], // Types [], // Enumerations [ FunctionSpec< diff --git a/libc/src/stdio/fseeko.h b/libc/src/stdio/fseeko.h index 77fb41215c31..3202ed2f97d0 100644 --- a/libc/src/stdio/fseeko.h +++ b/libc/src/stdio/fseeko.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_SRC_STDIO_FSEEKO_H #include +#include namespace LIBC_NAMESPACE { diff --git a/libc/src/stdio/ftello.h b/libc/src/stdio/ftello.h index 5ab17f9244a5..0fdf13ab6bdb 100644 --- a/libc/src/stdio/ftello.h +++ b/libc/src/stdio/ftello.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_SRC_STDIO_FTELLO_H #include +#include namespace LIBC_NAMESPACE { -- cgit v1.2.3 From 3a7b5223a6639e497c856368da11b5d74ec9d6e8 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 4 Apr 2024 12:36:15 +0800 Subject: [DAGCombiner][RISCV] Handle truncating splats in isNeutralConstant (#87338) On RV64, we legalize zexts of i1s to (vselect m, (splat_vector i64 1), (splat_vector i64 0)), where the splat_vectors are implicitly truncating. When the vselect is used by a binop we want to pull the vselect out via foldSelectWithIdentityConstant. But because vectors with an element size < i64 will truncate, isNeutralConstant will return false. This patch handles truncating splats by getting the APInt value and truncating it. 
We almost don't need to do this since most of the neutral elements are either one/zero/all ones, but it will make a difference for smax and smin. I wasn't able to figure out a way to write the tests in terms of select, since we need the i1 zext legalization to create a truncating splat_vector. This supercedes #87236. Fixed vectors are unfortunately not handled by this patch (since they get legalized to _VL nodes), but they don't seem to appear in the wild. --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 18 +- .../CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll | 21 +-- .../CodeGen/RISCV/rvv/fold-binop-into-select.ll | 60 ++++++ .../RISCV/rvv/vscale-vw-web-simplification.ll | 209 ++++++++++----------- 4 files changed, 181 insertions(+), 127 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/fold-binop-into-select.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 8c543ae8038a..25b51d5c1e02 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -11545,30 +11545,32 @@ bool llvm::isNeutralConstant(unsigned Opcode, SDNodeFlags Flags, SDValue V, unsigned OperandNo) { // NOTE: The cases should match with IR's ConstantExpr::getBinOpIdentity(). // TODO: Target-specific opcodes could be added. 
- if (auto *Const = isConstOrConstSplat(V)) { + if (auto *ConstV = isConstOrConstSplat(V, /*AllowUndefs*/ false, + /*AllowTruncation*/ true)) { + APInt Const = ConstV->getAPIntValue().trunc(V.getScalarValueSizeInBits()); switch (Opcode) { case ISD::ADD: case ISD::OR: case ISD::XOR: case ISD::UMAX: - return Const->isZero(); + return Const.isZero(); case ISD::MUL: - return Const->isOne(); + return Const.isOne(); case ISD::AND: case ISD::UMIN: - return Const->isAllOnes(); + return Const.isAllOnes(); case ISD::SMAX: - return Const->isMinSignedValue(); + return Const.isMinSignedValue(); case ISD::SMIN: - return Const->isMaxSignedValue(); + return Const.isMaxSignedValue(); case ISD::SUB: case ISD::SHL: case ISD::SRA: case ISD::SRL: - return OperandNo == 1 && Const->isZero(); + return OperandNo == 1 && Const.isZero(); case ISD::UDIV: case ISD::SDIV: - return OperandNo == 1 && Const->isOne(); + return OperandNo == 1 && Const.isOne(); } } else if (auto *ConstFP = isConstOrConstSplatFP(V)) { switch (Opcode) { diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll index bafa92e06834..65d0768c6088 100644 --- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll +++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll @@ -18,14 +18,12 @@ define i32 @ctz_nxv4i32( %a) #0 { ; RV32-NEXT: vmsne.vi v0, v8, 0 ; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vmerge.vim v8, v8, -1, v0 -; RV32-NEXT: vand.vv v8, v11, v8 +; RV32-NEXT: vmerge.vvm v8, v8, v11, v0 ; RV32-NEXT: vredmaxu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: sub a0, a0, a1 -; RV32-NEXT: lui a1, 16 -; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: slli a0, a0, 16 +; RV32-NEXT: srli a0, a0, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: ctz_nxv4i32: @@ -41,14 +39,12 @@ define i32 @ctz_nxv4i32( %a) #0 { ; RV64-NEXT: vmsne.vi v0, v8, 0 ; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; 
RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vim v8, v8, -1, v0 -; RV64-NEXT: vand.vv v8, v11, v8 +; RV64-NEXT: vmerge.vvm v8, v8, v11, v0 ; RV64-NEXT: vredmaxu.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a1, v8 -; RV64-NEXT: sub a0, a0, a1 -; RV64-NEXT: lui a1, 16 -; RV64-NEXT: addiw a1, a1, -1 -; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: subw a0, a0, a1 +; RV64-NEXT: slli a0, a0, 48 +; RV64-NEXT: srli a0, a0, 48 ; RV64-NEXT: ret %res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i32( %a, i1 0) ret i32 %res @@ -158,8 +154,7 @@ define i32 @ctz_nxv16i1( %pg, %a) { ; RV64-NEXT: li a1, -1 ; RV64-NEXT: vmadd.vx v16, a1, v8 ; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vim v8, v8, -1, v0 -; RV64-NEXT: vand.vv v8, v16, v8 +; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV64-NEXT: vredmaxu.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a1, v8 ; RV64-NEXT: subw a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-binop-into-select.ll b/llvm/test/CodeGen/RISCV/rvv/fold-binop-into-select.ll new file mode 100644 index 000000000000..3a8d08f306a5 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fold-binop-into-select.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s + +; The following binop x, (zext i1) tests will be vector-legalized into a vselect +; of two splat_vectors, but on RV64 the splat value will be implicitly +; truncated: +; +; t15: nxv2i32 = splat_vector Constant:i64<1> +; t13: nxv2i32 = splat_vector Constant:i64<0> +; t16: nxv2i32 = vselect t2, t15, t13 +; t7: nxv2i32 = add t4, t16 +; +; Make sure that foldSelectWithIdentityConstant in DAGCombiner.cpp handles the +; truncating splat, so we pull the vselect back and fold it into a mask. 
+ +define @i1_zext_add( %a, %b) { +; CHECK-LABEL: i1_zext_add: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t +; CHECK-NEXT: ret + %zext = zext %a to + %add = add %b, %zext + ret %add +} + +define @i1_zext_add_commuted( %a, %b) { +; CHECK-LABEL: i1_zext_add_commuted: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t +; CHECK-NEXT: ret + %zext = zext %a to + %add = add %zext, %b + ret %add +} + +define @i1_zext_sub( %a, %b) { +; CHECK-LABEL: i1_zext_sub: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 1 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %zext = zext %a to + %sub = sub %b, %zext + ret %sub +} + +define @i1_zext_or( %a, %b) { +; CHECK-LABEL: i1_zext_or: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; CHECK-NEXT: vor.vi v8, v8, 1, v0.t +; CHECK-NEXT: ret + %zext = zext %a to + %or = or %b, %zext + ret %or +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll index e56dca0732bb..a14ce7172615 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll @@ -149,49 +149,49 @@ define @vwop_vscale_sext_i32i64_multiple_users(ptr %x, ptr %y } define @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y, ptr %z) { -; RV32-LABEL: vwop_vscale_sext_i1i32_multiple_users: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, mu -; RV32-NEXT: vlm.v v8, (a0) -; RV32-NEXT: vlm.v v9, (a1) -; RV32-NEXT: vlm.v v10, (a2) -; RV32-NEXT: vmv.v.i v11, 0 -; RV32-NEXT: vmv.v.v v0, v8 -; RV32-NEXT: vmerge.vim v12, v11, -1, v0 -; RV32-NEXT: vmv.v.v v0, v9 -; RV32-NEXT: vmerge.vim v9, v11, -1, v0 -; RV32-NEXT: vmv.v.v v0, v10 -; RV32-NEXT: vmerge.vim v10, v11, -1, v0 -; RV32-NEXT: vmul.vv v9, v12, v9 -; RV32-NEXT: li a0, 1 -; 
RV32-NEXT: vsub.vv v11, v12, v10 -; RV32-NEXT: vmv.v.v v0, v8 -; RV32-NEXT: vsub.vx v10, v10, a0, v0.t -; RV32-NEXT: vor.vv v8, v9, v10 -; RV32-NEXT: vor.vv v8, v8, v11 -; RV32-NEXT: ret +; NO_FOLDING-LABEL: vwop_vscale_sext_i1i32_multiple_users: +; NO_FOLDING: # %bb.0: +; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu +; NO_FOLDING-NEXT: vlm.v v8, (a0) +; NO_FOLDING-NEXT: vlm.v v9, (a1) +; NO_FOLDING-NEXT: vlm.v v10, (a2) +; NO_FOLDING-NEXT: vmv.v.i v11, 0 +; NO_FOLDING-NEXT: vmv.v.v v0, v8 +; NO_FOLDING-NEXT: vmerge.vim v12, v11, -1, v0 +; NO_FOLDING-NEXT: vmv.v.v v0, v9 +; NO_FOLDING-NEXT: vmerge.vim v9, v11, -1, v0 +; NO_FOLDING-NEXT: vmv.v.v v0, v10 +; NO_FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 +; NO_FOLDING-NEXT: vmul.vv v9, v12, v9 +; NO_FOLDING-NEXT: li a0, 1 +; NO_FOLDING-NEXT: vsub.vv v11, v12, v10 +; NO_FOLDING-NEXT: vmv.v.v v0, v8 +; NO_FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t +; NO_FOLDING-NEXT: vor.vv v8, v9, v10 +; NO_FOLDING-NEXT: vor.vv v8, v8, v11 +; NO_FOLDING-NEXT: ret ; -; RV64-LABEL: vwop_vscale_sext_i1i32_multiple_users: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma -; RV64-NEXT: vlm.v v8, (a0) -; RV64-NEXT: vlm.v v9, (a1) -; RV64-NEXT: vlm.v v10, (a2) -; RV64-NEXT: vmv.v.i v11, 0 -; RV64-NEXT: vmv.v.v v0, v8 -; RV64-NEXT: vmerge.vim v12, v11, -1, v0 -; RV64-NEXT: vmv.v.v v0, v9 -; RV64-NEXT: vmerge.vim v9, v11, -1, v0 -; RV64-NEXT: vmv.v.v v0, v10 -; RV64-NEXT: vmerge.vim v10, v11, -1, v0 -; RV64-NEXT: vmul.vv v9, v12, v9 -; RV64-NEXT: vmv.v.v v0, v8 -; RV64-NEXT: vmerge.vim v8, v11, 1, v0 -; RV64-NEXT: vsub.vv v8, v10, v8 -; RV64-NEXT: vsub.vv v10, v12, v10 -; RV64-NEXT: vor.vv v8, v9, v8 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: ret +; FOLDING-LABEL: vwop_vscale_sext_i1i32_multiple_users: +; FOLDING: # %bb.0: +; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu +; FOLDING-NEXT: vlm.v v8, (a0) +; FOLDING-NEXT: vlm.v v9, (a1) +; FOLDING-NEXT: vlm.v v10, (a2) +; FOLDING-NEXT: vmv.v.i v11, 0 +; FOLDING-NEXT: 
vmv.v.v v0, v8 +; FOLDING-NEXT: vmerge.vim v12, v11, -1, v0 +; FOLDING-NEXT: vmv.v.v v0, v9 +; FOLDING-NEXT: vmerge.vim v9, v11, -1, v0 +; FOLDING-NEXT: vmv.v.v v0, v10 +; FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 +; FOLDING-NEXT: vmul.vv v9, v12, v9 +; FOLDING-NEXT: li a0, 1 +; FOLDING-NEXT: vsub.vv v11, v12, v10 +; FOLDING-NEXT: vmv.v.v v0, v8 +; FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t +; FOLDING-NEXT: vor.vv v8, v9, v10 +; FOLDING-NEXT: vor.vv v8, v8, v11 +; FOLDING-NEXT: ret %a = load , ptr %x %b = load , ptr %y %b2 = load , ptr %z @@ -209,7 +209,7 @@ define @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y, define @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, ptr %z) { ; NO_FOLDING-LABEL: vwop_vscale_sext_i1i8_multiple_users: ; NO_FOLDING: # %bb.0: -; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; NO_FOLDING-NEXT: vlm.v v8, (a0) ; NO_FOLDING-NEXT: vlm.v v9, (a1) ; NO_FOLDING-NEXT: vlm.v v10, (a2) @@ -221,17 +221,17 @@ define @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p ; NO_FOLDING-NEXT: vmv1r.v v0, v10 ; NO_FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 ; NO_FOLDING-NEXT: vmul.vv v9, v12, v9 +; NO_FOLDING-NEXT: li a0, 1 +; NO_FOLDING-NEXT: vsub.vv v11, v12, v10 ; NO_FOLDING-NEXT: vmv1r.v v0, v8 -; NO_FOLDING-NEXT: vmerge.vim v8, v11, 1, v0 -; NO_FOLDING-NEXT: vsub.vv v8, v10, v8 -; NO_FOLDING-NEXT: vsub.vv v10, v12, v10 -; NO_FOLDING-NEXT: vor.vv v8, v9, v8 -; NO_FOLDING-NEXT: vor.vv v8, v8, v10 +; NO_FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t +; NO_FOLDING-NEXT: vor.vv v8, v9, v10 +; NO_FOLDING-NEXT: vor.vv v8, v8, v11 ; NO_FOLDING-NEXT: ret ; ; FOLDING-LABEL: vwop_vscale_sext_i1i8_multiple_users: ; FOLDING: # %bb.0: -; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; FOLDING-NEXT: vlm.v v8, (a0) ; FOLDING-NEXT: vlm.v v9, (a1) ; FOLDING-NEXT: vlm.v v10, (a2) @@ -243,12 +243,12 @@ define @vwop_vscale_sext_i1i8_multiple_users(ptr %x, 
ptr %y, p ; FOLDING-NEXT: vmv1r.v v0, v10 ; FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 ; FOLDING-NEXT: vmul.vv v9, v12, v9 +; FOLDING-NEXT: li a0, 1 +; FOLDING-NEXT: vsub.vv v11, v12, v10 ; FOLDING-NEXT: vmv1r.v v0, v8 -; FOLDING-NEXT: vmerge.vim v8, v11, 1, v0 -; FOLDING-NEXT: vsub.vv v8, v10, v8 -; FOLDING-NEXT: vsub.vv v10, v12, v10 -; FOLDING-NEXT: vor.vv v8, v9, v8 -; FOLDING-NEXT: vor.vv v8, v8, v10 +; FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t +; FOLDING-NEXT: vor.vv v8, v9, v10 +; FOLDING-NEXT: vor.vv v8, v8, v11 ; FOLDING-NEXT: ret %a = load , ptr %x %b = load , ptr %y @@ -444,41 +444,39 @@ define @vwop_vscale_zext_i32i64_multiple_users(ptr %x, ptr %y } define @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y, ptr %z) { -; RV32-LABEL: vwop_vscale_zext_i1i32_multiple_users: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, mu -; RV32-NEXT: vlm.v v0, (a0) -; RV32-NEXT: vlm.v v8, (a2) -; RV32-NEXT: vlm.v v9, (a1) -; RV32-NEXT: vmv.v.i v10, 0 -; RV32-NEXT: vmerge.vim v11, v10, 1, v0 -; RV32-NEXT: vmv.v.v v0, v8 -; RV32-NEXT: vmerge.vim v8, v10, 1, v0 -; RV32-NEXT: vadd.vv v10, v11, v8 -; RV32-NEXT: vsub.vv v8, v11, v8 -; RV32-NEXT: vmv.v.v v0, v9 -; RV32-NEXT: vor.vv v10, v10, v11, v0.t -; RV32-NEXT: vor.vv v8, v10, v8 -; RV32-NEXT: ret +; NO_FOLDING-LABEL: vwop_vscale_zext_i1i32_multiple_users: +; NO_FOLDING: # %bb.0: +; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu +; NO_FOLDING-NEXT: vlm.v v0, (a0) +; NO_FOLDING-NEXT: vlm.v v8, (a2) +; NO_FOLDING-NEXT: vlm.v v9, (a1) +; NO_FOLDING-NEXT: vmv.v.i v10, 0 +; NO_FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 +; NO_FOLDING-NEXT: vmv.v.v v0, v8 +; NO_FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 +; NO_FOLDING-NEXT: vadd.vv v10, v11, v8 +; NO_FOLDING-NEXT: vsub.vv v8, v11, v8 +; NO_FOLDING-NEXT: vmv.v.v v0, v9 +; NO_FOLDING-NEXT: vor.vv v10, v10, v11, v0.t +; NO_FOLDING-NEXT: vor.vv v8, v10, v8 +; NO_FOLDING-NEXT: ret ; -; RV64-LABEL: vwop_vscale_zext_i1i32_multiple_users: -; RV64: # %bb.0: -; RV64-NEXT: 
vsetvli a3, zero, e32, m1, ta, ma -; RV64-NEXT: vlm.v v0, (a0) -; RV64-NEXT: vlm.v v8, (a1) -; RV64-NEXT: vlm.v v9, (a2) -; RV64-NEXT: vmv.v.i v10, 0 -; RV64-NEXT: vmerge.vim v11, v10, 1, v0 -; RV64-NEXT: vmv.v.v v0, v8 -; RV64-NEXT: vmerge.vim v8, v10, 1, v0 -; RV64-NEXT: vmv.v.v v0, v9 -; RV64-NEXT: vmerge.vim v9, v10, 1, v0 -; RV64-NEXT: vmul.vv v8, v11, v8 -; RV64-NEXT: vadd.vv v10, v11, v9 -; RV64-NEXT: vsub.vv v9, v11, v9 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: ret +; FOLDING-LABEL: vwop_vscale_zext_i1i32_multiple_users: +; FOLDING: # %bb.0: +; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu +; FOLDING-NEXT: vlm.v v0, (a0) +; FOLDING-NEXT: vlm.v v8, (a2) +; FOLDING-NEXT: vlm.v v9, (a1) +; FOLDING-NEXT: vmv.v.i v10, 0 +; FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 +; FOLDING-NEXT: vmv.v.v v0, v8 +; FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 +; FOLDING-NEXT: vadd.vv v10, v11, v8 +; FOLDING-NEXT: vsub.vv v8, v11, v8 +; FOLDING-NEXT: vmv.v.v v0, v9 +; FOLDING-NEXT: vor.vv v10, v10, v11, v0.t +; FOLDING-NEXT: vor.vv v8, v10, v8 +; FOLDING-NEXT: ret %a = load , ptr %x %b = load , ptr %y %b2 = load , ptr %z @@ -496,40 +494,36 @@ define @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y, define @vwop_vscale_zext_i1i8_multiple_users(ptr %x, ptr %y, ptr %z) { ; NO_FOLDING-LABEL: vwop_vscale_zext_i1i8_multiple_users: ; NO_FOLDING: # %bb.0: -; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; NO_FOLDING-NEXT: vlm.v v0, (a0) -; NO_FOLDING-NEXT: vlm.v v8, (a1) -; NO_FOLDING-NEXT: vlm.v v9, (a2) +; NO_FOLDING-NEXT: vlm.v v8, (a2) +; NO_FOLDING-NEXT: vlm.v v9, (a1) ; NO_FOLDING-NEXT: vmv.v.i v10, 0 ; NO_FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 ; NO_FOLDING-NEXT: vmv1r.v v0, v8 ; NO_FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 +; NO_FOLDING-NEXT: vadd.vv v10, v11, v8 +; NO_FOLDING-NEXT: vsub.vv v8, v11, v8 ; NO_FOLDING-NEXT: vmv1r.v v0, v9 -; NO_FOLDING-NEXT: vmerge.vim v9, v10, 1, v0 -; 
NO_FOLDING-NEXT: vmul.vv v8, v11, v8 -; NO_FOLDING-NEXT: vadd.vv v10, v11, v9 -; NO_FOLDING-NEXT: vsub.vv v9, v11, v9 -; NO_FOLDING-NEXT: vor.vv v8, v8, v10 -; NO_FOLDING-NEXT: vor.vv v8, v8, v9 +; NO_FOLDING-NEXT: vor.vv v10, v10, v11, v0.t +; NO_FOLDING-NEXT: vor.vv v8, v10, v8 ; NO_FOLDING-NEXT: ret ; ; FOLDING-LABEL: vwop_vscale_zext_i1i8_multiple_users: ; FOLDING: # %bb.0: -; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; FOLDING-NEXT: vlm.v v0, (a0) -; FOLDING-NEXT: vlm.v v8, (a1) -; FOLDING-NEXT: vlm.v v9, (a2) +; FOLDING-NEXT: vlm.v v8, (a2) +; FOLDING-NEXT: vlm.v v9, (a1) ; FOLDING-NEXT: vmv.v.i v10, 0 ; FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 ; FOLDING-NEXT: vmv1r.v v0, v8 ; FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 +; FOLDING-NEXT: vadd.vv v10, v11, v8 +; FOLDING-NEXT: vsub.vv v8, v11, v8 ; FOLDING-NEXT: vmv1r.v v0, v9 -; FOLDING-NEXT: vmerge.vim v9, v10, 1, v0 -; FOLDING-NEXT: vmul.vv v8, v11, v8 -; FOLDING-NEXT: vadd.vv v10, v11, v9 -; FOLDING-NEXT: vsub.vv v9, v11, v9 -; FOLDING-NEXT: vor.vv v8, v8, v10 -; FOLDING-NEXT: vor.vv v8, v8, v9 +; FOLDING-NEXT: vor.vv v10, v10, v11, v0.t +; FOLDING-NEXT: vor.vv v8, v10, v8 ; FOLDING-NEXT: ret %a = load , ptr %x %b = load , ptr %y @@ -594,3 +588,6 @@ define @vwop_vscale_zext_i8i32_multiple_users(ptr %x, ptr %y, +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} -- cgit v1.2.3 From d89914f30bc7c180fe349a5aa0f03438ae6c20a4 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 3 Apr 2024 21:48:38 -0700 Subject: [memprof] Add Version2 of IndexedMemProfRecord serialization (#87455) I'm currently developing a new version of the indexed memprof format where we deduplicate call stacks in IndexedAllocationInfo::CallStack and IndexedMemProfRecord::CallSites. We refer to call stacks with integer IDs, namely CallStackId, just as we refer to Frame with FrameId. 
The deduplication will cut down the profile file size by 80% in a large memprof file of mine. As a step toward the goal, this patch teaches IndexedMemProfRecord::{serialize,deserialize} to speak Version2. A subsequent patch will add Version2 support to llvm-profdata. The essense of the patch is to replace the serialization of a call stack, a vector of FrameIDs, with that of a CallStackId. That is: const IndexedAllocationInfo &N = ...; ... LE.write(N.CallStack.size()); for (const FrameId &Id : N.CallStack) LE.write(Id); becomes: LE.write(N.CSId); --- llvm/include/llvm/ProfileData/MemProf.h | 61 ++++------- llvm/lib/ProfileData/InstrProfReader.cpp | 2 +- llvm/lib/ProfileData/InstrProfWriter.cpp | 6 +- llvm/lib/ProfileData/MemProf.cpp | 170 +++++++++++++++++++++++++++-- llvm/unittests/ProfileData/MemProfTest.cpp | 41 ++++++- 5 files changed, 226 insertions(+), 54 deletions(-) diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index ff00900a1466..110e69770264 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -22,6 +22,8 @@ enum IndexedVersion : uint64_t { Version0 = 0, // Version 1: Added a version field to the header. Version1 = 1, + // Version 2: Added a call stack table. Under development. + Version2 = 2, }; constexpr uint64_t MinimumSupportedVersion = Version0; @@ -289,23 +291,14 @@ struct IndexedAllocationInfo { : CallStack(CS.begin(), CS.end()), CSId(CSId), Info(MB) {} // Returns the size in bytes when this allocation info struct is serialized. - size_t serializedSize() const { - return sizeof(uint64_t) + // The number of frames to serialize. - sizeof(FrameId) * CallStack.size() + // The callstack frame ids. - PortableMemInfoBlock::serializedSize(); // The size of the payload. 
- } + size_t serializedSize(IndexedVersion Version) const; bool operator==(const IndexedAllocationInfo &Other) const { if (Other.Info != Info) return false; - if (Other.CallStack.size() != CallStack.size()) + if (Other.CSId != CSId) return false; - - for (size_t J = 0; J < Other.CallStack.size(); J++) { - if (Other.CallStack[J] != CallStack[J]) - return false; - } return true; } @@ -357,6 +350,9 @@ struct IndexedMemProfRecord { // inline location list may include additional entries, users should pick // the last entry in the list with the same function GUID. llvm::SmallVector> CallSites; + // Conceptually the same as above. We are going to keep both CallSites and + // CallSiteIds while we are transitioning from CallSites to CallSiteIds. + llvm::SmallVector CallSiteIds; void clear() { AllocSites.clear(); @@ -370,47 +366,31 @@ struct IndexedMemProfRecord { CallSites.append(Other.CallSites); } - size_t serializedSize() const { - size_t Result = sizeof(GlobalValue::GUID); - for (const IndexedAllocationInfo &N : AllocSites) - Result += N.serializedSize(); - - // The number of callsites we have information for. - Result += sizeof(uint64_t); - for (const auto &Frames : CallSites) { - // The number of frame ids to serialize. - Result += sizeof(uint64_t); - Result += Frames.size() * sizeof(FrameId); - } - return Result; - } + size_t serializedSize(IndexedVersion Version) const; bool operator==(const IndexedMemProfRecord &Other) const { if (Other.AllocSites.size() != AllocSites.size()) return false; - if (Other.CallSites.size() != CallSites.size()) - return false; - for (size_t I = 0; I < AllocSites.size(); I++) { if (AllocSites[I] != Other.AllocSites[I]) return false; } - for (size_t I = 0; I < CallSites.size(); I++) { - if (CallSites[I] != Other.CallSites[I]) - return false; - } + if (Other.CallSiteIds != CallSiteIds) + return false; return true; } // Serializes the memprof records in \p Records to the ostream \p OS based // on the schema provided in \p Schema. 
- void serialize(const MemProfSchema &Schema, raw_ostream &OS); + void serialize(const MemProfSchema &Schema, raw_ostream &OS, + IndexedVersion Version); // Deserializes memprof records from the Buffer. static IndexedMemProfRecord deserialize(const MemProfSchema &Schema, - const unsigned char *Buffer); + const unsigned char *Buffer, + IndexedVersion Version); // Returns the GUID for the function name after canonicalization. For // memprof, we remove any .llvm suffix added by LTO. MemProfRecords are @@ -480,7 +460,8 @@ public: using offset_type = uint64_t; RecordLookupTrait() = delete; - RecordLookupTrait(const MemProfSchema &S) : Schema(S) {} + RecordLookupTrait(IndexedVersion V, const MemProfSchema &S) + : Version(V), Schema(S) {} static bool EqualKey(uint64_t A, uint64_t B) { return A == B; } static uint64_t GetInternalKey(uint64_t K) { return K; } @@ -507,11 +488,13 @@ public: data_type ReadData(uint64_t K, const unsigned char *D, offset_type /*Unused*/) { - Record = IndexedMemProfRecord::deserialize(Schema, D); + Record = IndexedMemProfRecord::deserialize(Schema, D, Version); return Record; } private: + // Holds the MemProf version. + IndexedVersion Version; // Holds the memprof schema used to deserialize records. MemProfSchema Schema; // Holds the records from one function deserialized from the indexed format. @@ -519,7 +502,7 @@ private: }; // Trait for writing IndexedMemProfRecord data to the on-disk hash table. 
-class RecordWriterTrait { +template class RecordWriterTrait { public: using key_type = uint64_t; using key_type_ref = uint64_t; @@ -546,7 +529,7 @@ public: endian::Writer LE(Out, llvm::endianness::little); offset_type N = sizeof(K); LE.write(N); - offset_type M = V.serializedSize(); + offset_type M = V.serializedSize(Version); LE.write(M); return std::make_pair(N, M); } @@ -560,7 +543,7 @@ public: void EmitData(raw_ostream &Out, key_type_ref /*Unused*/, data_type_ref V, offset_type /*Unused*/) { assert(Schema != nullptr && "MemProf schema is not initialized!"); - V.serialize(*Schema, Out); + V.serialize(*Schema, Out, Version); // Clear the IndexedMemProfRecord which results in clearing/freeing its // vectors of allocs and callsites. This is owned by the associated on-disk // hash table, but unused after this point. See also the comment added to diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 7ac5c561dc08..884334ed070e 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -1303,7 +1303,7 @@ Error IndexedInstrProfReader::readHeader() { MemProfRecordTable.reset(MemProfRecordHashTable::Create( /*Buckets=*/Start + RecordTableOffset, /*Payload=*/Ptr, - /*Base=*/Start, memprof::RecordLookupTrait(Schema))); + /*Base=*/Start, memprof::RecordLookupTrait(memprof::Version1, Schema))); // Initialize the frame table reader with the payload and bucket offsets. 
MemProfFrameTable.reset(MemProfFrameHashTable::Create( diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index c2c94ba30c65..a1bc180a53ca 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -557,9 +557,11 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { OS.write(static_cast(Id)); } - auto RecordWriter = std::make_unique(); + auto RecordWriter = + std::make_unique>(); RecordWriter->Schema = &Schema; - OnDiskChainedHashTableGenerator + OnDiskChainedHashTableGenerator< + memprof::RecordWriterTrait> RecordTableGenerator; for (auto &I : MemProfRecordData) { // Insert the key (func hash) and value (memprof record). diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index 6c419811d59e..ac0a8702c3f9 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -10,15 +10,88 @@ namespace llvm { namespace memprof { +namespace { +size_t serializedSizeV0(const IndexedAllocationInfo &IAI) { + size_t Size = 0; + // The number of frames to serialize. + Size += sizeof(uint64_t); + // The callstack frame ids. + Size += sizeof(FrameId) * IAI.CallStack.size(); + // The size of the payload. + Size += PortableMemInfoBlock::serializedSize(); + return Size; +} -void IndexedMemProfRecord::serialize(const MemProfSchema &Schema, - raw_ostream &OS) { +size_t serializedSizeV2(const IndexedAllocationInfo &IAI) { + size_t Size = 0; + // The CallStackId + Size += sizeof(CallStackId); + // The size of the payload. 
+ Size += PortableMemInfoBlock::serializedSize(); + return Size; +} +} // namespace + +size_t IndexedAllocationInfo::serializedSize(IndexedVersion Version) const { + switch (Version) { + case Version0: + case Version1: + return serializedSizeV0(*this); + case Version2: + return serializedSizeV2(*this); + } + llvm_unreachable("unsupported MemProf version"); +} + +namespace { +size_t serializedSizeV0(const IndexedMemProfRecord &Record) { + size_t Result = sizeof(GlobalValue::GUID); + for (const IndexedAllocationInfo &N : Record.AllocSites) + Result += N.serializedSize(Version0); + + // The number of callsites we have information for. + Result += sizeof(uint64_t); + for (const auto &Frames : Record.CallSites) { + // The number of frame ids to serialize. + Result += sizeof(uint64_t); + Result += Frames.size() * sizeof(FrameId); + } + return Result; +} + +size_t serializedSizeV2(const IndexedMemProfRecord &Record) { + size_t Result = sizeof(GlobalValue::GUID); + for (const IndexedAllocationInfo &N : Record.AllocSites) + Result += N.serializedSize(Version2); + + // The number of callsites we have information for. 
+ Result += sizeof(uint64_t); + // The CallStackId + Result += Record.CallSiteIds.size() * sizeof(CallStackId); + return Result; +} +} // namespace + +size_t IndexedMemProfRecord::serializedSize(IndexedVersion Version) const { + switch (Version) { + case Version0: + case Version1: + return serializedSizeV0(*this); + case Version2: + return serializedSizeV2(*this); + } + llvm_unreachable("unsupported MemProf version"); +} + +namespace { +void serializeV0(const IndexedMemProfRecord &Record, + const MemProfSchema &Schema, raw_ostream &OS) { using namespace support; endian::Writer LE(OS, llvm::endianness::little); - LE.write(AllocSites.size()); - for (const IndexedAllocationInfo &N : AllocSites) { + LE.write(Record.AllocSites.size()); + for (const IndexedAllocationInfo &N : Record.AllocSites) { LE.write(N.CallStack.size()); for (const FrameId &Id : N.CallStack) LE.write(Id); @@ -26,17 +99,50 @@ void IndexedMemProfRecord::serialize(const MemProfSchema &Schema, } // Related contexts. - LE.write(CallSites.size()); - for (const auto &Frames : CallSites) { + LE.write(Record.CallSites.size()); + for (const auto &Frames : Record.CallSites) { LE.write(Frames.size()); for (const FrameId &Id : Frames) LE.write(Id); } } -IndexedMemProfRecord -IndexedMemProfRecord::deserialize(const MemProfSchema &Schema, - const unsigned char *Ptr) { +void serializeV2(const IndexedMemProfRecord &Record, + const MemProfSchema &Schema, raw_ostream &OS) { + using namespace support; + + endian::Writer LE(OS, llvm::endianness::little); + + LE.write(Record.AllocSites.size()); + for (const IndexedAllocationInfo &N : Record.AllocSites) { + LE.write(N.CSId); + N.Info.serialize(Schema, OS); + } + + // Related contexts. 
+ LE.write(Record.CallSiteIds.size()); + for (const auto &CSId : Record.CallSiteIds) + LE.write(CSId); +} +} // namespace + +void IndexedMemProfRecord::serialize(const MemProfSchema &Schema, + raw_ostream &OS, IndexedVersion Version) { + switch (Version) { + case Version0: + case Version1: + serializeV0(*this, Schema, OS); + return; + case Version2: + serializeV2(*this, Schema, OS); + return; + } + llvm_unreachable("unsupported MemProf version"); +} + +namespace { +IndexedMemProfRecord deserializeV0(const MemProfSchema &Schema, + const unsigned char *Ptr) { using namespace support; IndexedMemProfRecord Record; @@ -73,11 +179,57 @@ IndexedMemProfRecord::deserialize(const MemProfSchema &Schema, Frames.push_back(Id); } Record.CallSites.push_back(Frames); + Record.CallSiteIds.push_back(hashCallStack(Frames)); } return Record; } +IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema, + const unsigned char *Ptr) { + using namespace support; + + IndexedMemProfRecord Record; + + // Read the meminfo nodes. + const uint64_t NumNodes = + endian::readNext(Ptr); + for (uint64_t I = 0; I < NumNodes; I++) { + IndexedAllocationInfo Node; + Node.CSId = + endian::readNext(Ptr); + Node.Info.deserialize(Schema, Ptr); + Ptr += PortableMemInfoBlock::serializedSize(); + Record.AllocSites.push_back(Node); + } + + // Read the callsite information. 
+ const uint64_t NumCtxs = + endian::readNext(Ptr); + for (uint64_t J = 0; J < NumCtxs; J++) { + CallStackId CSId = + endian::readNext(Ptr); + Record.CallSiteIds.push_back(CSId); + } + + return Record; +} +} // namespace + +IndexedMemProfRecord +IndexedMemProfRecord::deserialize(const MemProfSchema &Schema, + const unsigned char *Ptr, + IndexedVersion Version) { + switch (Version) { + case Version0: + case Version1: + return deserializeV0(Schema, Ptr); + case Version2: + return deserializeV2(Schema, Ptr); + } + llvm_unreachable("unsupported MemProf version"); +} + GlobalValue::GUID IndexedMemProfRecord::getGUID(const StringRef FunctionName) { // Canonicalize the function name to drop suffixes such as ".llvm.". Note // we do not drop any ".__uniq." suffixes, as getCanonicalFnName does not drop diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp index 1cca44e9b037..f1aa6f37aa39 100644 --- a/llvm/unittests/ProfileData/MemProfTest.cpp +++ b/llvm/unittests/ProfileData/MemProfTest.cpp @@ -265,7 +265,9 @@ TEST(MemProf, PortableWrapper) { EXPECT_EQ(3UL, ReadBlock.getAllocCpuId()); } -TEST(MemProf, RecordSerializationRoundTrip) { +// Version0 and Version1 serialize IndexedMemProfRecord in the same format, so +// we share one test. 
+TEST(MemProf, RecordSerializationRoundTripVersion0And1) { const MemProfSchema Schema = getFullSchema(); MemInfoBlock Info(/*size=*/16, /*access_count=*/7, /*alloc_timestamp=*/1000, @@ -284,14 +286,47 @@ TEST(MemProf, RecordSerializationRoundTrip) { Info); } Record.CallSites.assign(CallSites); + for (const auto &CS : CallSites) + Record.CallSiteIds.push_back(llvm::memprof::hashCallStack(CS)); std::string Buffer; llvm::raw_string_ostream OS(Buffer); - Record.serialize(Schema, OS); + Record.serialize(Schema, OS, llvm::memprof::Version0); OS.flush(); const IndexedMemProfRecord GotRecord = IndexedMemProfRecord::deserialize( - Schema, reinterpret_cast(Buffer.data())); + Schema, reinterpret_cast(Buffer.data()), + llvm::memprof::Version0); + + EXPECT_EQ(Record, GotRecord); +} + +TEST(MemProf, RecordSerializationRoundTripVerion2) { + const MemProfSchema Schema = getFullSchema(); + + MemInfoBlock Info(/*size=*/16, /*access_count=*/7, /*alloc_timestamp=*/1000, + /*dealloc_timestamp=*/2000, /*alloc_cpu=*/3, + /*dealloc_cpu=*/4); + + llvm::SmallVector CallStackIds = {0x123, 0x456}; + + llvm::SmallVector CallSiteIds = {0x333, 0x444}; + + IndexedMemProfRecord Record; + for (const auto &CSId : CallStackIds) { + // Use the same info block for both allocation sites. 
+ Record.AllocSites.emplace_back(llvm::SmallVector(), CSId, Info); + } + Record.CallSiteIds.assign(CallSiteIds); + + std::string Buffer; + llvm::raw_string_ostream OS(Buffer); + Record.serialize(Schema, OS, llvm::memprof::Version2); + OS.flush(); + + const IndexedMemProfRecord GotRecord = IndexedMemProfRecord::deserialize( + Schema, reinterpret_cast(Buffer.data()), + llvm::memprof::Version2); EXPECT_EQ(Record, GotRecord); } -- cgit v1.2.3 From 4f19f15a601a5761b12c9c66d99d97dbc89ef90d Mon Sep 17 00:00:00 2001 From: Daniil Kovalev Date: Thu, 4 Apr 2024 08:20:13 +0300 Subject: [PAC][llvm-readobj][AArch64][ELF] Support `GNU_PROPERTY_AARCH64_FEATURE_PAUTH` (#87545) Reland #85231 after fixing build failure https://lab.llvm.org/buildbot/#/builders/186/builds/15631. Use `PRIx64` for format output of `uint64_t` as hex. Original PR description below. This adds support for `GNU_PROPERTY_AARCH64_FEATURE_PAUTH` feature (as defined in https://github.com/ARM-software/abi-aa/pull/240) handling in llvm-readobj and llvm-readelf. The following constants for supported platforms are also introduced: - `AARCH64_PAUTH_PLATFORM_INVALID = 0x0` - `AARCH64_PAUTH_PLATFORM_BAREMETAL = 0x1` - `AARCH64_PAUTH_PLATFORM_LLVM_LINUX = 0x10000002` For the llvm_linux platform, output of the tools contains descriptions of PAuth features which are enabled/disabled depending on the version value. Version value bits correspond to the following `LangOptions` defined in #85232: - bit 0: `PointerAuthIntrinsics`; - bit 1: `PointerAuthCalls`; - bit 2: `PointerAuthReturns`; - bit 3: `PointerAuthAuthTraps`; - bit 4: `PointerAuthVTPtrAddressDiscrimination`; - bit 5: `PointerAuthVTPtrTypeDiscrimination`; - bit 6: `PointerAuthInitFini`. Support for `.note.AARCH64-PAUTH-ABI-tag` is dropped since it's deleted from the spec in ARM-software/abi-aa#250. 
--- llvm/include/llvm/BinaryFormat/ELF.h | 26 +- .../ELF/AArch64/aarch64-feature-pauth.s | 298 ++++++++++++++------- .../ELF/AArch64/aarch64-note-gnu-property.s | 2 + llvm/tools/llvm-readobj/ELFDumper.cpp | 127 +++++---- 4 files changed, 296 insertions(+), 157 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 877f3f7862c8..ed267c100b76 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -1712,11 +1712,6 @@ enum { NT_ANDROID_TYPE_MEMTAG = 4, }; -// ARM note types. -enum { - NT_ARM_TYPE_PAUTH_ABI_TAG = 1, -}; - // Memory tagging values used in NT_ANDROID_TYPE_MEMTAG notes. enum { // Enumeration to determine the tagging mode. In Android-land, 'SYNC' means @@ -1740,6 +1735,7 @@ enum : unsigned { GNU_PROPERTY_STACK_SIZE = 1, GNU_PROPERTY_NO_COPY_ON_PROTECTED = 2, GNU_PROPERTY_AARCH64_FEATURE_1_AND = 0xc0000000, + GNU_PROPERTY_AARCH64_FEATURE_PAUTH = 0xc0000001, GNU_PROPERTY_X86_FEATURE_1_AND = 0xc0000002, GNU_PROPERTY_X86_UINT32_OR_LO = 0xc0008000, @@ -1758,6 +1754,26 @@ enum : unsigned { GNU_PROPERTY_AARCH64_FEATURE_1_GCS = 1 << 2, }; +// aarch64 PAuth platforms. +enum : unsigned { + AARCH64_PAUTH_PLATFORM_INVALID = 0x0, + AARCH64_PAUTH_PLATFORM_BAREMETAL = 0x1, + AARCH64_PAUTH_PLATFORM_LLVM_LINUX = 0x10000002, +}; + +// Bit positions of version flags for AARCH64_PAUTH_PLATFORM_LLVM_LINUX. +enum : unsigned { + AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INTRINSICS = 0, + AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_CALLS = 1, + AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_RETURNS = 2, + AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_AUTHTRAPS = 3, + AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_VPTRADDRDISCR = 4, + AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_VPTRTYPEDISCR = 5, + AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INITFINI = 6, + AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_LAST = + AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INITFINI, +}; + // x86 processor feature bits. 
enum : unsigned { GNU_PROPERTY_X86_FEATURE_1_IBT = 1 << 0, diff --git a/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s b/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s index f28d92eae857..512531748cd2 100644 --- a/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s +++ b/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s @@ -1,98 +1,204 @@ # RUN: rm -rf %t && split-file %s %t && cd %t -# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag.s -o tag.o -# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-short.s -o tag-short.o -# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-long.s -o tag-long.o - -# RUN: llvm-readelf --notes tag.o | FileCheck --check-prefix NORMAL %s -# RUN: llvm-readelf --notes tag-short.o | FileCheck --check-prefix SHORT %s -# RUN: llvm-readelf --notes tag-long.o | FileCheck --check-prefix LONG %s - -# NORMAL: AArch64 PAuth ABI tag: platform 0x2a, version 0x1 -# SHORT: AArch64 PAuth ABI tag: -# LONG: AArch64 PAuth ABI tag: platform 0x2a, version 0x1, additional info 0xEFCDAB8967452301 - -# RUN: llvm-readobj --notes tag.o | FileCheck --check-prefix LLVM-NORMAL %s -# RUN: llvm-readobj --notes tag-short.o | FileCheck --check-prefix LLVM-SHORT %s -# RUN: llvm-readobj --notes tag-long.o | FileCheck --check-prefix LLVM-LONG %s - -# LLVM-SHORT: Notes [ -# LLVM-SHORT-NEXT: NoteSection { -# LLVM-SHORT-NEXT: Name: .note.AARCH64-PAUTH-ABI-tag -# LLVM-SHORT-NEXT: Offset: 0x40 -# LLVM-SHORT-NEXT: Size: 0x1C -# LLVM-SHORT-NEXT: Note { -# LLVM-SHORT-NEXT: Owner: ARM -# LLVM-SHORT-NEXT: Data size: 0xC -# LLVM-SHORT-NEXT: Type: NT_ARM_TYPE_PAUTH_ABI_TAG -# LLVM-SHORT-NEXT: Description data ( -# LLVM-SHORT-NEXT: 0000: 2A000000 00000000 01000000 -# LLVM-SHORT-NEXT: ) -# LLVM-SHORT-NEXT: } -# LLVM-SHORT-NEXT: } -# LLVM-SHORT-NEXT: ] - -# LLVM-NORMAL: Notes [ -# LLVM-NORMAL-NEXT: NoteSection { -# LLVM-NORMAL-NEXT: Name: .note.AARCH64-PAUTH-ABI-tag -# LLVM-NORMAL-NEXT: 
Offset: 0x40 -# LLVM-NORMAL-NEXT: Size: 0x20 -# LLVM-NORMAL-NEXT: Note { -# LLVM-NORMAL-NEXT: Owner: ARM -# LLVM-NORMAL-NEXT: Data size: 0x10 -# LLVM-NORMAL-NEXT: Type: NT_ARM_TYPE_PAUTH_ABI_TAG -# LLVM-NORMAL-NEXT: Platform: 42 -# LLVM-NORMAL-NEXT: Version: 1 -# LLVM-NORMAL-NEXT: } -# LLVM-NORMAL-NEXT: } -# LLVM-NORMAL-NEXT: ] - -# LLVM-LONG: Notes [ -# LLVM-LONG-NEXT: NoteSection { -# LLVM-LONG-NEXT: Name: .note.AARCH64-PAUTH-ABI-tag -# LLVM-LONG-NEXT: Offset: 0x40 -# LLVM-LONG-NEXT: Size: 0x28 -# LLVM-LONG-NEXT: Note { -# LLVM-LONG-NEXT: Owner: ARM -# LLVM-LONG-NEXT: Data size: 0x18 -# LLVM-LONG-NEXT: Type: NT_ARM_TYPE_PAUTH_ABI_TAG -# LLVM-LONG-NEXT: Platform: 42 -# LLVM-LONG-NEXT: Version: 1 -# LLVM-LONG-NEXT: Additional info: EFCDAB8967452301 -# LLVM-LONG-NEXT: } -# LLVM-LONG-NEXT: } -# LLVM-LONG-NEXT: ] - -#--- abi-tag.s - -.section ".note.AARCH64-PAUTH-ABI-tag", "a" -.long 4 -.long 16 -.long 1 -.asciz "ARM" - -.quad 42 // platform -.quad 1 // version - -#--- abi-tag-short.s - -.section ".note.AARCH64-PAUTH-ABI-tag", "a" -.long 4 -.long 12 -.long 1 -.asciz "ARM" - -.quad 42 -.word 1 - -#--- abi-tag-long.s - -.section ".note.AARCH64-PAUTH-ABI-tag", "a" -.long 4 -.long 24 -.long 1 -.asciz "ARM" - -.quad 42 // platform -.quad 1 // version -.quad 0x0123456789ABCDEF // extra data +#--- gnu-42-1.s +.section ".note.gnu.property", "a" + .long 4 // Name length is always 4 ("GNU") + .long end - begin // Data length + .long 5 // Type: NT_GNU_PROPERTY_TYPE_0 + .asciz "GNU" // Name + .p2align 3 +begin: + # PAuth ABI property note + .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH + .long 16 // Data size + .quad 42 // PAuth ABI platform + .quad 1 // PAuth ABI version + .p2align 3 // Align to 8 byte for 64 bit +end: + +# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-42-1.s -o gnu-42-1.o +# RUN: llvm-readelf --notes gnu-42-1.o | \ +# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x2a (unknown)" -DVERSION=0x1 %s +# RUN: llvm-readobj --notes gnu-42-1.o 
| \ +# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x2a (unknown)" -DVERSION=0x1 %s + +# ELF: Displaying notes found in: .note.gnu.property +# ELF-NEXT: Owner Data size Description +# ELF-NEXT: GNU 0x00000018 NT_GNU_PROPERTY_TYPE_0 (property note) +# ELF-NEXT: AArch64 PAuth ABI core info: platform [[PLATFORM]], version [[VERSION]] + +# OBJ: Notes [ +# OBJ-NEXT: NoteSection { +# OBJ-NEXT: Name: .note.gnu.property +# OBJ-NEXT: Offset: 0x40 +# OBJ-NEXT: Size: 0x28 +# OBJ-NEXT: Note { +# OBJ-NEXT: Owner: GNU +# OBJ-NEXT: Data size: 0x18 +# OBJ-NEXT: Type: NT_GNU_PROPERTY_TYPE_0 (property note) +# OBJ-NEXT: Property [ +# OBJ-NEXT: AArch64 PAuth ABI core info: platform [[PLATFORM]], version [[VERSION]] +# OBJ-NEXT: ] +# OBJ-NEXT: } +# OBJ-NEXT: } +# OBJ-NEXT: ] + +#--- gnu-0-0.s +.section ".note.gnu.property", "a" + .long 4 // Name length is always 4 ("GNU") + .long end - begin // Data length + .long 5 // Type: NT_GNU_PROPERTY_TYPE_0 + .asciz "GNU" // Name + .p2align 3 +begin: + # PAuth ABI property note + .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH + .long 16 // Data size + .quad 0 // PAuth ABI platform + .quad 0 // PAuth ABI version + .p2align 3 // Align to 8 byte for 64 bit +end: + +# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-0-0.s -o gnu-0-0.o +# RUN: llvm-readelf --notes gnu-0-0.o | \ +# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x0 (invalid)" -DVERSION=0x0 %s +# RUN: llvm-readobj --notes gnu-0-0.o | \ +# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x0 (invalid)" -DVERSION=0x0 %s + +#--- gnu-1-0.s +.section ".note.gnu.property", "a" + .long 4 // Name length is always 4 ("GNU") + .long end - begin // Data length + .long 5 // Type: NT_GNU_PROPERTY_TYPE_0 + .asciz "GNU" // Name + .p2align 3 +begin: + # PAuth ABI property note + .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH + .long 16 // Data size + .quad 1 // PAuth ABI platform + .quad 0 // PAuth ABI version + .p2align 3 // Align to 8 byte for 64 bit +end: + +# 
RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-1-0.s -o gnu-1-0.o +# RUN: llvm-readelf --notes gnu-1-0.o | \ +# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x1 (baremetal)" -DVERSION=0x0 %s +# RUN: llvm-readobj --notes gnu-1-0.o | \ +# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x1 (baremetal)" -DVERSION=0x0 %s + +#--- gnu-0x10000002-85.s +.section ".note.gnu.property", "a" + .long 4 // Name length is always 4 ("GNU") + .long end - begin // Data length + .long 5 // Type: NT_GNU_PROPERTY_TYPE_0 + .asciz "GNU" // Name + .p2align 3 +begin: + # PAuth ABI property note + .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH + .long 16 // Data size + .quad 0x10000002 // PAuth ABI platform + .quad 85 // PAuth ABI version + .p2align 3 // Align to 8 byte for 64 bit +end: + +# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-0x10000002-85.s -o gnu-0x10000002-85.o +# RUN: llvm-readelf --notes gnu-0x10000002-85.o | \ +# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x10000002 (llvm_linux)" \ +# RUN: -DVERSION="0x55 (PointerAuthIntrinsics, !PointerAuthCalls, PointerAuthReturns, !PointerAuthAuthTraps, PointerAuthVTPtrAddressDiscrimination, !PointerAuthVTPtrTypeDiscrimination, PointerAuthInitFini)" %s +# RUN: llvm-readobj --notes gnu-0x10000002-85.o | \ +# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x10000002 (llvm_linux)" \ +# RUN: -DVERSION="0x55 (PointerAuthIntrinsics, !PointerAuthCalls, PointerAuthReturns, !PointerAuthAuthTraps, PointerAuthVTPtrAddressDiscrimination, !PointerAuthVTPtrTypeDiscrimination, PointerAuthInitFini)" %s + +#--- gnu-0x10000002-128.s +.section ".note.gnu.property", "a" + .long 4 // Name length is always 4 ("GNU") + .long end - begin // Data length + .long 5 // Type: NT_GNU_PROPERTY_TYPE_0 + .asciz "GNU" // Name + .p2align 3 +begin: + # PAuth ABI property note + .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH + .long 16 // Data size + .quad 0x10000002 // PAuth ABI platform + .quad 128 // PAuth ABI version + 
.p2align 3 // Align to 8 byte for 64 bit +end: + +# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-0x10000002-128.s -o gnu-0x10000002-128.o +# RUN: llvm-readelf --notes gnu-0x10000002-128.o | \ +# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x10000002 (llvm_linux)" -DVERSION="0x80 (unknown)" %s +# RUN: llvm-readobj --notes gnu-0x10000002-128.o | \ +# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x10000002 (llvm_linux)" -DVERSION="0x80 (unknown)" %s + +#--- gnu-short.s +.section ".note.gnu.property", "a" + .long 4 // Name length is always 4 ("GNU") + .long end - begin // Data length + .long 5 // Type: NT_GNU_PROPERTY_TYPE_0 + .asciz "GNU" // Name + .p2align 3 +begin: + # PAuth ABI property note + .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH + .long 12 // Data size + .quad 42 // PAuth ABI platform + .word 1 // PAuth ABI version + .p2align 3 // Align to 8 byte for 64 bit +end: + +# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-short.s -o gnu-short.o +# RUN: llvm-readelf --notes gnu-short.o | \ +# RUN: FileCheck --check-prefix=ELF-ERR -DSIZE=28 -DDATASIZE=18 \ +# RUN: -DERR="" %s +# RUN: llvm-readobj --notes gnu-short.o | \ +# RUN: FileCheck --check-prefix=OBJ-ERR -DSIZE=28 -DDATASIZE=18 \ +# RUN: -DERR="" %s + +# ELF-ERR: Displaying notes found in: .note.gnu.property +# ELF-ERR-NEXT: Owner Data size Description +# ELF-ERR-NEXT: GNU 0x000000[[DATASIZE]] NT_GNU_PROPERTY_TYPE_0 (property note) +# ELF-ERR-NEXT: AArch64 PAuth ABI core info: [[ERR]] + +# OBJ-ERR: Notes [ +# OBJ-ERR-NEXT: NoteSection { +# OBJ-ERR-NEXT: Name: .note.gnu.property +# OBJ-ERR-NEXT: Offset: 0x40 +# OBJ-ERR-NEXT: Size: 0x[[SIZE]] +# OBJ-ERR-NEXT: Note { +# OBJ-ERR-NEXT: Owner: GNU +# OBJ-ERR-NEXT: Data size: 0x[[DATASIZE]] +# OBJ-ERR-NEXT: Type: NT_GNU_PROPERTY_TYPE_0 (property note) +# OBJ-ERR-NEXT: Property [ +# OBJ-ERR-NEXT: AArch64 PAuth ABI core info: [[ERR]] +# OBJ-ERR-NEXT: ] +# OBJ-ERR-NEXT: } +# OBJ-ERR-NEXT: } +# OBJ-ERR-NEXT: ] + +#--- gnu-long.s 
+.section ".note.gnu.property", "a" + .long 4 // Name length is always 4 ("GNU") + .long end - begin // Data length + .long 5 // Type: NT_GNU_PROPERTY_TYPE_0 + .asciz "GNU" // Name + .p2align 3 +begin: + # PAuth ABI property note + .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH + .long 24 // Data size + .quad 42 // PAuth ABI platform + .quad 1 // PAuth ABI version + .quad 0x0123456789ABCDEF + .p2align 3 // Align to 8 byte for 64 bit +end: + +# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-long.s -o gnu-long.o +# RUN: llvm-readelf --notes gnu-long.o | \ +# RUN: FileCheck --check-prefix=ELF-ERR -DSIZE=30 -DDATASIZE=20 \ +# RUN: -DERR="" %s +# RUN: llvm-readobj --notes gnu-long.o | \ +# RUN: FileCheck --check-prefix=OBJ-ERR -DSIZE=30 -DDATASIZE=20 \ +# RUN: -DERR="" %s diff --git a/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s b/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s index 377e6f93448c..b517f0b38155 100644 --- a/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s +++ b/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s @@ -1,3 +1,5 @@ +// See tests for GNU_PROPERTY_AARCH64_FEATURE_PAUTH in aarch64-feature-pauth.s + // RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu %s -o %t // RUN: llvm-readelf --notes %t | FileCheck %s --check-prefix=GNU // RUN: llvm-readobj --notes %t | FileCheck %s --check-prefix=LLVM diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 4b406ef12aec..d6dda6198017 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -61,6 +61,7 @@ #include "llvm/Support/SystemZ/zOSSupport.h" #include "llvm/Support/raw_ostream.h" #include +#include #include #include #include @@ -5105,6 +5106,73 @@ template void GNUELFDumper::printAddrsig() { } } +template +static bool printAArch64PAuthABICoreInfo(raw_ostream &OS, uint32_t DataSize, + ArrayRef Desc) { + OS << " 
AArch64 PAuth ABI core info: "; + // DataSize - size without padding, Desc.size() - size with padding + if (DataSize != 16) { + OS << format("", DataSize); + return false; + } + + uint64_t Platform = + support::endian::read64(Desc.data() + 0); + uint64_t Version = support::endian::read64(Desc.data() + 8); + + const char *PlatformDesc = [Platform]() { + switch (Platform) { + case AARCH64_PAUTH_PLATFORM_INVALID: + return "invalid"; + case AARCH64_PAUTH_PLATFORM_BAREMETAL: + return "baremetal"; + case AARCH64_PAUTH_PLATFORM_LLVM_LINUX: + return "llvm_linux"; + default: + return "unknown"; + } + }(); + + std::string VersionDesc = [Platform, Version]() -> std::string { + if (Platform != AARCH64_PAUTH_PLATFORM_LLVM_LINUX) + return ""; + if (Version >= (1 << (AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_LAST + 1))) + return "unknown"; + + std::array + Flags; + Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INTRINSICS] = "Intrinsics"; + Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_CALLS] = "Calls"; + Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_RETURNS] = "Returns"; + Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_AUTHTRAPS] = "AuthTraps"; + Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_VPTRADDRDISCR] = + "VTPtrAddressDiscrimination"; + Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_VPTRTYPEDISCR] = + "VTPtrTypeDiscrimination"; + Flags[AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INITFINI] = "InitFini"; + + static_assert(AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_INITFINI == + AARCH64_PAUTH_PLATFORM_LLVM_LINUX_VERSION_LAST, + "Update when new enum items are defined"); + + std::string Desc; + for (uint32_t I = 0, End = Flags.size(); I < End; ++I) { + if (!(Version & (1 << I))) + Desc += '!'; + Desc += + Twine("PointerAuth" + Flags[I] + (I == End - 1 ? 
"" : ", ")).str(); + } + return Desc; + }(); + + OS << format("platform 0x%" PRIx64 " (%s), version 0x%" PRIx64, Platform, + PlatformDesc, Version); + if (!VersionDesc.empty()) + OS << format(" (%s)", VersionDesc.c_str()); + + return true; +} + template static std::string getGNUProperty(uint32_t Type, uint32_t DataSize, ArrayRef Data) { @@ -5162,6 +5230,9 @@ static std::string getGNUProperty(uint32_t Type, uint32_t DataSize, if (PrData) OS << format("", PrData); return OS.str(); + case GNU_PROPERTY_AARCH64_FEATURE_PAUTH: + printAArch64PAuthABICoreInfo(OS, DataSize, Data); + return OS.str(); case GNU_PROPERTY_X86_FEATURE_2_NEEDED: case GNU_PROPERTY_X86_FEATURE_2_USED: OS << "x86 feature " @@ -5363,29 +5434,6 @@ static bool printAndroidNote(raw_ostream &OS, uint32_t NoteType, return true; } -template -static bool printAArch64Note(raw_ostream &OS, uint32_t NoteType, - ArrayRef Desc) { - if (NoteType != NT_ARM_TYPE_PAUTH_ABI_TAG) - return false; - - OS << " AArch64 PAuth ABI tag: "; - if (Desc.size() < 16) { - OS << format("", Desc.size()); - return false; - } - - uint64_t Platform = endian::read64(Desc.data() + 0); - uint64_t Version = endian::read64(Desc.data() + 8); - OS << format("platform 0x%" PRIx64 ", version 0x%" PRIx64, Platform, Version); - - if (Desc.size() > 16) - OS << ", additional info 0x" - << toHex(ArrayRef(Desc.data() + 16, Desc.size() - 16)); - - return true; -} - template void GNUELFDumper::printMemtag( const ArrayRef> DynamicEntries, @@ -5783,10 +5831,6 @@ const NoteType AndroidNoteTypes[] = { "NT_ANDROID_TYPE_MEMTAG (Android memory tagging information)"}, }; -const NoteType ARMNoteTypes[] = { - {ELF::NT_ARM_TYPE_PAUTH_ABI_TAG, "NT_ARM_TYPE_PAUTH_ABI_TAG"}, -}; - const NoteType CoreNoteTypes[] = { {ELF::NT_PRSTATUS, "NT_PRSTATUS (prstatus structure)"}, {ELF::NT_FPREGSET, "NT_FPREGSET (floating point registers)"}, @@ -5905,8 +5949,6 @@ StringRef getNoteTypeName(const typename ELFT::Note &Note, unsigned ELFType) { return 
FindNote(LLVMOMPOFFLOADNoteTypes); if (Name == "Android") return FindNote(AndroidNoteTypes); - if (Name == "ARM") - return FindNote(ARMNoteTypes); if (ELFType == ELF::ET_CORE) return FindNote(CoreNoteTypes); @@ -6062,9 +6104,6 @@ template void GNUELFDumper::printNotes() { } else if (Name == "Android") { if (printAndroidNote(OS, Type, Descriptor)) return Error::success(); - } else if (Name == "ARM") { - if (printAArch64Note(OS, Type, Descriptor)) - return Error::success(); } if (!Descriptor.empty()) { OS << " description data:"; @@ -7702,27 +7741,6 @@ static bool printAndroidNoteLLVMStyle(uint32_t NoteType, ArrayRef Desc, return true; } -template -static bool printAarch64NoteLLVMStyle(uint32_t NoteType, ArrayRef Desc, - ScopedPrinter &W) { - if (NoteType != NT_ARM_TYPE_PAUTH_ABI_TAG) - return false; - - if (Desc.size() < 16) - return false; - - uint64_t platform = endian::read64(Desc.data() + 0); - uint64_t version = endian::read64(Desc.data() + 8); - W.printNumber("Platform", platform); - W.printNumber("Version", version); - - if (Desc.size() > 16) - W.printString("Additional info", - toHex(ArrayRef(Desc.data() + 16, Desc.size() - 16))); - - return true; -} - template void LLVMELFDumper::printMemtag( const ArrayRef> DynamicEntries, @@ -7859,9 +7877,6 @@ template void LLVMELFDumper::printNotes() { } else if (Name == "Android") { if (printAndroidNoteLLVMStyle(Type, Descriptor, W)) return Error::success(); - } else if (Name == "ARM") { - if (printAarch64NoteLLVMStyle(Type, Descriptor, W)) - return Error::success(); } if (!Descriptor.empty()) { W.printBinaryBlock("Description data", Descriptor); -- cgit v1.2.3 From d4cd65ecf2546e509f43363f96364c976f49b9da Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Thu, 4 Apr 2024 08:48:11 +0200 Subject: [LVI] Handle range attributes (#86413) This adds handling of range attribute for return values of Call and Invoke in getFromRangeMetadata and handling of argument with range attribute in solveBlockValueNonLocal. 
There is one additional check of the range metadata at line 1120 in getValueFromSimpleICmpCondition that is not covered in this PR as after https://github.com/llvm/llvm-project/pull/75311 there is no test that cover that check any more and I have not been able to create a test that trigger that code. --- llvm/lib/Analysis/LazyValueInfo.cpp | 13 +++- .../Transforms/CorrelatedValuePropagation/range.ll | 86 +++++++++++++++++++--- 2 files changed, 84 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index b8bc81197c95..6cded828c25f 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -588,10 +588,14 @@ LazyValueInfoImpl::getBlockValue(Value *Val, BasicBlock *BB, static ValueLatticeElement getFromRangeMetadata(Instruction *BBI) { switch (BBI->getOpcode()) { - default: break; - case Instruction::Load: + default: + break; case Instruction::Call: case Instruction::Invoke: + if (std::optional Range = cast(BBI)->getRange()) + return ValueLatticeElement::getRange(*Range); + [[fallthrough]]; + case Instruction::Load: if (MDNode *Ranges = BBI->getMetadata(LLVMContext::MD_range)) if (isa(BBI->getType())) { return ValueLatticeElement::getRange( @@ -706,10 +710,11 @@ std::optional LazyValueInfoImpl::solveBlockValueNonLocal(Value *Val, BasicBlock *BB) { ValueLatticeElement Result; // Start Undefined. - // If this is the entry block, we must be asking about an argument. The - // value is overdefined. + // If this is the entry block, we must be asking about an argument. 
if (BB->isEntryBlock()) { assert(isa(Val) && "Unknown live-in to the entry block"); + if (std::optional Range = cast(Val)->getRange()) + return ValueLatticeElement::getRange(*Range); return ValueLatticeElement::getOverdefined(); } diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/range.ll b/llvm/test/Transforms/CorrelatedValuePropagation/range.ll index cc66cbe7fce6..ce1b591218d1 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/range.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/range.ll @@ -102,9 +102,9 @@ if.end8: define i32 @test4(i32 %c) nounwind { ; CHECK-LABEL: @test4( ; CHECK-NEXT: switch i32 [[C:%.*]], label [[SW_DEFAULT:%.*]] [ -; CHECK-NEXT: i32 1, label [[SW_BB:%.*]] -; CHECK-NEXT: i32 2, label [[SW_BB]] -; CHECK-NEXT: i32 4, label [[SW_BB]] +; CHECK-NEXT: i32 1, label [[SW_BB:%.*]] +; CHECK-NEXT: i32 2, label [[SW_BB]] +; CHECK-NEXT: i32 4, label [[SW_BB]] ; CHECK-NEXT: ] ; CHECK: sw.bb: ; CHECK-NEXT: br i1 true, label [[IF_THEN:%.*]], label [[IF_END:%.*]] @@ -207,8 +207,8 @@ define i1 @test7(i32 %c) nounwind { ; CHECK-LABEL: @test7( ; CHECK-NEXT: entry: ; CHECK-NEXT: switch i32 [[C:%.*]], label [[SW_DEFAULT:%.*]] [ -; CHECK-NEXT: i32 6, label [[SW_BB:%.*]] -; CHECK-NEXT: i32 7, label [[SW_BB]] +; CHECK-NEXT: i32 6, label [[SW_BB:%.*]] +; CHECK-NEXT: i32 7, label [[SW_BB]] ; CHECK-NEXT: ] ; CHECK: sw.bb: ; CHECK-NEXT: ret i1 true @@ -790,8 +790,8 @@ define i32 @test18(i8 %a) { ; CHECK-NEXT: br label [[DISPATCH:%.*]] ; CHECK: dispatch: ; CHECK-NEXT: switch i8 [[A]], label [[DISPATCH]] [ -; CHECK-NEXT: i8 93, label [[TARGET93:%.*]] -; CHECK-NEXT: i8 -111, label [[DISPATCH]] +; CHECK-NEXT: i8 93, label [[TARGET93:%.*]] +; CHECK-NEXT: i8 -111, label [[DISPATCH]] ; CHECK-NEXT: ] ; CHECK: target93: ; CHECK-NEXT: ret i32 93 @@ -817,8 +817,8 @@ define i8 @test19(i8 %a) { ; CHECK-NEXT: br label [[DISPATCH:%.*]] ; CHECK: dispatch: ; CHECK-NEXT: switch i8 [[A]], label [[DISPATCH]] [ -; CHECK-NEXT: i8 93, label [[TARGET93:%.*]] -; 
CHECK-NEXT: i8 -111, label [[DISPATCH]] +; CHECK-NEXT: i8 93, label [[TARGET93:%.*]] +; CHECK-NEXT: i8 -111, label [[DISPATCH]] ; CHECK-NEXT: ] ; CHECK: target93: ; CHECK-NEXT: ret i8 96 @@ -846,8 +846,8 @@ define i1 @test20(i64 %a) { ; CHECK-NEXT: br label [[DISPATCH:%.*]] ; CHECK: dispatch: ; CHECK-NEXT: switch i64 [[A]], label [[DEFAULT:%.*]] [ -; CHECK-NEXT: i64 0, label [[EXIT2:%.*]] -; CHECK-NEXT: i64 -2147483647, label [[EXIT2]] +; CHECK-NEXT: i64 0, label [[EXIT2:%.*]] +; CHECK-NEXT: i64 -2147483647, label [[EXIT2]] ; CHECK-NEXT: ] ; CHECK: default: ; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[B]], 0 @@ -1123,6 +1123,70 @@ else: ret i1 true } +define i1 @icmp_eq_range_attr(i8 range(i8 1, 0) %i) { +; CHECK-LABEL: @icmp_eq_range_attr( +; CHECK-NEXT: ret i1 false +; + %cmp = icmp eq i8 %i, 0 + ret i1 %cmp +} + +define i1 @neg_icmp_eq_range_attr(i8 range(i8 -1, 1) %i) { +; CHECK-LABEL: @neg_icmp_eq_range_attr( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[I:%.*]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %cmp = icmp eq i8 %i, 0 + ret i1 %cmp +} + +declare range(i8 1, 0) i8 @returns_non_zero_range_helper() +declare range(i8 -1, 1) i8 @returns_contain_zero_range_helper() + +define i1 @icmp_eq_range_return() { +; CHECK-LABEL: @icmp_eq_range_return( +; CHECK-NEXT: [[I:%.*]] = call i8 @returns_non_zero_range_helper() +; CHECK-NEXT: ret i1 false +; + %i = call i8 @returns_non_zero_range_helper() + %cmp = icmp eq i8 %i, 0 + ret i1 %cmp +} + +define i1 @neg_icmp_eq_range_return() { +; CHECK-LABEL: @neg_icmp_eq_range_return( +; CHECK-NEXT: [[I:%.*]] = call i8 @returns_contain_zero_range_helper() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[I]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i = call i8 @returns_contain_zero_range_helper() + %cmp = icmp eq i8 %i, 0 + ret i1 %cmp +} + +declare i8 @returns_i8_helper() + +define i1 @icmp_eq_range_call() { +; CHECK-LABEL: @icmp_eq_range_call( +; CHECK-NEXT: [[I:%.*]] = call range(i8 1, 0) i8 @returns_i8_helper() +; CHECK-NEXT: ret i1 false +; + %i = 
call range(i8 1, 0) i8 @returns_i8_helper() + %cmp = icmp eq i8 %i, 0 + ret i1 %cmp +} + +define i1 @neg_icmp_eq_range_call() { +; CHECK-LABEL: @neg_icmp_eq_range_call( +; CHECK-NEXT: [[I:%.*]] = call range(i8 0, 11) i8 @returns_i8_helper() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[I]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %i = call range(i8 0, 11) i8 @returns_i8_helper() + %cmp = icmp eq i8 %i, 0 + ret i1 %cmp +} + declare i16 @llvm.ctlz.i16(i16, i1) declare i16 @llvm.cttz.i16(i16, i1) declare i16 @llvm.ctpop.i16(i16) -- cgit v1.2.3 From d6e458219897fad0e460e663833b2190af48c06d Mon Sep 17 00:00:00 2001 From: Christian Ulmann Date: Thu, 4 Apr 2024 09:34:37 +0200 Subject: [MLIR][LLVM][Mem2Reg] Relax type equality requirement for load and store (#87504) This commit relaxes Mem2Reg's type equality requirement for the LLVM dialect's load and store operations. For now, we only allow loads to be promoted if the reaching definition can be casted into a value of the target type. For stores, all type checks are removed, as a non-volatile store that does not write out the alloca's pointer can always be deleted. --- mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp | 38 ++++- mlir/test/Dialect/LLVMIR/mem2reg.mlir | 197 ++++++++++++++++++++++++++ 2 files changed, 231 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp index 06c1fdd2eb2d..f8fd3bdaff4a 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp @@ -126,6 +126,13 @@ Value LLVM::StoreOp::getStored(const MemorySlot &slot, RewriterBase &rewriter) { return getValue(); } +/// Checks that two types are the same or can be cast into one another. 
+static bool areCastCompatible(const DataLayout &layout, Type lhs, Type rhs) { + return lhs == rhs || (!isa(lhs) && + !isa(rhs) && + layout.getTypeSize(lhs) == layout.getTypeSize(rhs)); +} + bool LLVM::LoadOp::canUsesBeRemoved( const MemorySlot &slot, const SmallPtrSetImpl &blockingUses, SmallVectorImpl &newBlockingUses, @@ -138,7 +145,29 @@ bool LLVM::LoadOp::canUsesBeRemoved( // be removed (provided it loads the exact stored value and is not // volatile). return blockingUse == slot.ptr && getAddr() == slot.ptr && - getResult().getType() == slot.elemType && !getVolatile_(); + areCastCompatible(dataLayout, getResult().getType(), slot.elemType) && + !getVolatile_(); +} + +/// Constructs operations that convert `inputValue` into a new value of type +/// `targetType`. Assumes that this conversion is possible. +static Value createConversionSequence(RewriterBase &rewriter, Location loc, + Value inputValue, Type targetType) { + if (inputValue.getType() == targetType) + return inputValue; + + if (!isa(targetType) && + !isa(inputValue.getType())) + return rewriter.createOrFold(loc, targetType, inputValue); + + if (!isa(targetType)) + return rewriter.createOrFold(loc, targetType, inputValue); + + if (!isa(inputValue.getType())) + return rewriter.createOrFold(loc, targetType, inputValue); + + return rewriter.createOrFold(loc, targetType, + inputValue); } DeletionKind LLVM::LoadOp::removeBlockingUses( @@ -146,7 +175,9 @@ DeletionKind LLVM::LoadOp::removeBlockingUses( RewriterBase &rewriter, Value reachingDefinition) { // `canUsesBeRemoved` checked this blocking use must be the loaded slot // pointer. - rewriter.replaceAllUsesWith(getResult(), reachingDefinition); + Value newResult = createConversionSequence( + rewriter, getLoc(), reachingDefinition, getResult().getType()); + rewriter.replaceAllUsesWith(getResult(), newResult); return DeletionKind::Delete; } @@ -161,8 +192,7 @@ bool LLVM::StoreOp::canUsesBeRemoved( // fine, provided we are currently promoting its target value. 
Don't allow a // store OF the slot pointer, only INTO the slot pointer. return blockingUse == slot.ptr && getAddr() == slot.ptr && - getValue() != slot.ptr && getValue().getType() == slot.elemType && - !getVolatile_(); + getValue() != slot.ptr && !getVolatile_(); } DeletionKind LLVM::StoreOp::removeBlockingUses( diff --git a/mlir/test/Dialect/LLVMIR/mem2reg.mlir b/mlir/test/Dialect/LLVMIR/mem2reg.mlir index 90e56c1166ed..d6d5e1bdc93c 100644 --- a/mlir/test/Dialect/LLVMIR/mem2reg.mlir +++ b/mlir/test/Dialect/LLVMIR/mem2reg.mlir @@ -697,3 +697,200 @@ llvm.func @transitive_reaching_def() -> !llvm.ptr { %3 = llvm.load %1 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr llvm.return %3 : !llvm.ptr } + +// ----- + +// CHECK-LABEL: @load_int_from_float +llvm.func @load_int_from_float() -> i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK-NOT: llvm.alloca + %1 = llvm.alloca %0 x f32 {alignment = 4 : i64} : (i32) -> !llvm.ptr + %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i32 + // CHECK: %[[UNDEF:.*]] = llvm.mlir.undef + // CHECK: %[[BITCAST:.*]] = llvm.bitcast %[[UNDEF]] : f32 to i32 + // CHECK: llvm.return %[[BITCAST:.*]] + llvm.return %2 : i32 +} + +// ----- + +// CHECK-LABEL: @load_float_from_int +llvm.func @load_float_from_int() -> f32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK-NOT: llvm.alloca + %1 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr + %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> f32 + // CHECK: %[[UNDEF:.*]] = llvm.mlir.undef + // CHECK: %[[BITCAST:.*]] = llvm.bitcast %[[UNDEF]] : i32 to f32 + // CHECK: llvm.return %[[BITCAST:.*]] + llvm.return %2 : f32 +} + +// ----- + +// CHECK-LABEL: @load_int_from_vector +llvm.func @load_int_from_vector() -> i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK-NOT: llvm.alloca + %1 = llvm.alloca %0 x vector<2xi16> : (i32) -> !llvm.ptr + %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i32 + // CHECK: %[[UNDEF:.*]] = llvm.mlir.undef + // CHECK: 
%[[BITCAST:.*]] = llvm.bitcast %[[UNDEF]] : vector<2xi16> to i32 + // CHECK: llvm.return %[[BITCAST:.*]] + llvm.return %2 : i32 +} + +// ----- + +// LLVM arrays cannot be bitcasted, so the following cannot be promoted. + +// CHECK-LABEL: @load_int_from_array +llvm.func @load_int_from_array() -> i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: llvm.alloca + %1 = llvm.alloca %0 x !llvm.array<2 x i16> : (i32) -> !llvm.ptr + %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i32 + // CHECK-NOT: llvm.bitcast + llvm.return %2 : i32 +} + +// ----- + +// CHECK-LABEL: @store_int_to_float +// CHECK-SAME: %[[ARG:.*]]: i32 +llvm.func @store_int_to_float(%arg: i32) -> i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK-NOT: llvm.alloca + %1 = llvm.alloca %0 x f32 {alignment = 4 : i64} : (i32) -> !llvm.ptr + llvm.store %arg, %1 {alignment = 4 : i64} : i32, !llvm.ptr + %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i32 + // CHECK: llvm.return %[[ARG]] + llvm.return %2 : i32 +} + +// ----- + +// CHECK-LABEL: @store_float_to_int +// CHECK-SAME: %[[ARG:.*]]: f32 +llvm.func @store_float_to_int(%arg: f32) -> i32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK-NOT: llvm.alloca + %1 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr + llvm.store %arg, %1 {alignment = 4 : i64} : f32, !llvm.ptr + %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i32 + // CHECK: %[[BITCAST:.*]] = llvm.bitcast %[[ARG]] : f32 to i32 + // CHECK: llvm.return %[[BITCAST]] + llvm.return %2 : i32 +} + +// ----- + +// CHECK-LABEL: @store_int_to_vector +// CHECK-SAME: %[[ARG:.*]]: i32 +llvm.func @store_int_to_vector(%arg: i32) -> vector<4xi8> { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK-NOT: llvm.alloca + %1 = llvm.alloca %0 x vector<2xi16> {alignment = 4 : i64} : (i32) -> !llvm.ptr + llvm.store %arg, %1 {alignment = 4 : i64} : i32, !llvm.ptr + %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> vector<4xi8> + // CHECK: %[[BITCAST:.*]] = llvm.bitcast 
%[[ARG]] : i32 to vector<4xi8> + // CHECK: llvm.return %[[BITCAST]] + llvm.return %2 : vector<4xi8> +} + +// ----- + +// CHECK-LABEL: @load_ptr_from_int +llvm.func @load_ptr_from_int() -> !llvm.ptr { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK-NOT: llvm.alloca + %1 = llvm.alloca %0 x i64 {alignment = 4 : i64} : (i32) -> !llvm.ptr + %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> !llvm.ptr + // CHECK: %[[UNDEF:.*]] = llvm.mlir.undef + // CHECK: %[[CAST:.*]] = llvm.inttoptr %[[UNDEF]] : i64 to !llvm.ptr + // CHECK: llvm.return %[[CAST:.*]] + llvm.return %2 : !llvm.ptr +} + +// ----- + +// CHECK-LABEL: @load_int_from_ptr +llvm.func @load_int_from_ptr() -> i64 { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK-NOT: llvm.alloca + %1 = llvm.alloca %0 x !llvm.ptr {alignment = 4 : i64} : (i32) -> !llvm.ptr + %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i64 + // CHECK: %[[UNDEF:.*]] = llvm.mlir.undef + // CHECK: %[[CAST:.*]] = llvm.ptrtoint %[[UNDEF]] : !llvm.ptr to i64 + // CHECK: llvm.return %[[CAST:.*]] + llvm.return %2 : i64 +} + +// ----- + +// CHECK-LABEL: @load_ptr_addrspace_cast +llvm.func @load_ptr_addrspace_cast() -> !llvm.ptr<2> { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK-NOT: llvm.alloca + %1 = llvm.alloca %0 x !llvm.ptr<1> {alignment = 4 : i64} : (i32) -> !llvm.ptr + %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> !llvm.ptr<2> + // CHECK: %[[UNDEF:.*]] = llvm.mlir.undef + // CHECK: %[[CAST:.*]] = llvm.addrspacecast %[[UNDEF]] : !llvm.ptr<1> to !llvm.ptr<2> + // CHECK: llvm.return %[[CAST:.*]] + llvm.return %2 : !llvm.ptr<2> +} + +// ----- + +// CHECK-LABEL: @load_smaller_int +llvm.func @load_smaller_int() -> i16 { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: llvm.alloca + %1 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr + %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i16 + llvm.return %2 : i16 +} + +// ----- + +// CHECK-LABEL: @load_different_type_smaller +llvm.func 
@load_different_type_smaller() -> f32 { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: llvm.alloca + %1 = llvm.alloca %0 x i64 {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> f32 + llvm.return %2 : f32 +} + +// ----- + +// This alloca is too small for the load, still, mem2reg should not touch it. + +// CHECK-LABEL: @impossible_load +llvm.func @impossible_load() -> f64 { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: llvm.alloca + %1 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr + %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> f64 + llvm.return %2 : f64 +} + +// ----- + +// Verifies that mem2reg does not introduce address space casts of pointers +// with different bitsize. + +module attributes { dlti.dl_spec = #dlti.dl_spec< + #dlti.dl_entry, dense<[32, 64, 64]> : vector<3xi64>>, + #dlti.dl_entry, dense<[64, 64, 64]> : vector<3xi64>> +>} { + + // CHECK-LABEL: @load_ptr_addrspace_cast_different_size + llvm.func @load_ptr_addrspace_cast_different_size() -> !llvm.ptr<2> { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: llvm.alloca + %1 = llvm.alloca %0 x !llvm.ptr<1> {alignment = 4 : i64} : (i32) -> !llvm.ptr + %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> !llvm.ptr<2> + llvm.return %2 : !llvm.ptr<2> + } +} -- cgit v1.2.3 From e69cab7f37cf6f35306356cddb26049fd6138df7 Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Thu, 4 Apr 2024 09:37:45 +0200 Subject: [Flang] Make SLES 15 build tests (#87498) SLES 15 comes with a GCC 7.5 as default, which does not support the C++17 `` header. This results in build errors when trying to run `check-flang`. This patch addresses that and uses the older `std::stol` for the string -> number conversion to allow the SLES 15 buildbot (https://lab.llvm.org/staging/#/builders/193) to turn green. 
--- flang/unittests/Runtime/Time.cpp | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/flang/unittests/Runtime/Time.cpp b/flang/unittests/Runtime/Time.cpp index ec0caa743bcd..5c93282bca61 100644 --- a/flang/unittests/Runtime/Time.cpp +++ b/flang/unittests/Runtime/Time.cpp @@ -12,7 +12,7 @@ #include "flang/Runtime/time-intrinsic.h" #include #include -#include +#include #include using namespace Fortran::runtime; @@ -104,10 +104,9 @@ TEST(TimeIntrinsics, DateAndTime) { EXPECT_TRUE(true); } else { count_t number{-1}; - auto [_, ec]{ - std::from_chars(date.data(), date.data() + date.size(), number)}; - ASSERT_TRUE(ec != std::errc::invalid_argument && - ec != std::errc::result_out_of_range); + // Use stol to allow GCC 7.5 to build tests + number = std::stol(date); + ASSERT_TRUE(errno != ERANGE); EXPECT_GE(number, 0); auto year = number / 10000; auto month = (number - year * 10000) / 100; @@ -121,14 +120,15 @@ TEST(TimeIntrinsics, DateAndTime) { } // Validate time is hhmmss.sss or blank. + std::string acceptedPattern("hhmmss.sss"); if (isBlank(time)) { EXPECT_TRUE(true); } else { count_t number{-1}; - auto [next, ec]{ - std::from_chars(time.data(), time.data() + date.size(), number)}; - ASSERT_TRUE(ec != std::errc::invalid_argument && - ec != std::errc::result_out_of_range); + // Use stol to allow GCC 7.5 to build tests + auto dotPosition = acceptedPattern.find('.'); + number = std::stol(time.substr(0, dotPosition)); + ASSERT_TRUE(errno != ERANGE); ASSERT_GE(number, 0); auto hours = number / 10000; auto minutes = (number - hours * 10000) / 100; @@ -137,15 +137,11 @@ TEST(TimeIntrinsics, DateAndTime) { EXPECT_LE(minutes, 59); // Accept 60 for leap seconds. 
EXPECT_LE(seconds, 60); - ASSERT_TRUE(next != time.data() + time.size()); - EXPECT_EQ(*next, '.'); + EXPECT_EQ(time.substr(dotPosition, 1), "."); count_t milliseconds{-1}; - ASSERT_TRUE(next + 1 != time.data() + time.size()); - auto [_, ec2]{ - std::from_chars(next + 1, time.data() + date.size(), milliseconds)}; - ASSERT_TRUE(ec2 != std::errc::invalid_argument && - ec2 != std::errc::result_out_of_range); + milliseconds = std::stol(time.substr(dotPosition + 1, 3)); + ASSERT_TRUE(errno != ERANGE); EXPECT_GE(milliseconds, 0); EXPECT_LE(milliseconds, 999); } @@ -157,10 +153,9 @@ TEST(TimeIntrinsics, DateAndTime) { ASSERT_TRUE(zone.size() > 1); EXPECT_TRUE(zone[0] == '+' || zone[0] == '-'); count_t number{-1}; - auto [next, ec]{ - std::from_chars(zone.data() + 1, zone.data() + zone.size(), number)}; - ASSERT_TRUE(ec != std::errc::invalid_argument && - ec != std::errc::result_out_of_range); + // Use stol to allow GCC 7.5 to build tests + number = std::stol(zone.substr(1, 4)); + ASSERT_TRUE(errno != ERANGE); ASSERT_GE(number, 0); auto hours = number / 100; auto minutes = number % 100; -- cgit v1.2.3 From d542cb3175f0c5691808b9c50234788c7be1154f Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Thu, 4 Apr 2024 16:56:31 +0900 Subject: [mlir][Interfaces][NFC] `ValueBoundsConstraintSet`: Delete dead code (#86098) There is an assertion that the stop condition is not satisfied for the the starting point at the beginning of `computeBound`. Therefore, that case does not have to be handled later on in that function. 
--- mlir/lib/Interfaces/ValueBoundsOpInterface.cpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp index 99598f2e89d9..9a3185d55d6e 100644 --- a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp +++ b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp @@ -286,18 +286,6 @@ LogicalResult ValueBoundsConstraintSet::computeBound( Builder b(value.getContext()); mapOperands.clear(); - if (stopCondition(value, dim)) { - // Special case: If the stop condition is satisfied for the input - // value/dimension, directly return it. - mapOperands.push_back(std::make_pair(value, dim)); - AffineExpr bound = b.getAffineDimExpr(0); - if (type == BoundType::UB) - bound = bound + ubAdjustment; - resultMap = AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0, - b.getAffineDimExpr(0)); - return success(); - } - // Process the backward slice of `value` (i.e., reverse use-def chain) until // `stopCondition` is met. 
ValueDim valueDim = std::make_pair(value, dim.value_or(kIndexValue)); -- cgit v1.2.3 From 35886dc63a2d024e20c10d2e1cb3f5fa5d9f72cc Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Thu, 4 Apr 2024 09:56:04 +0200 Subject: [clang] Remove an unintended statement, NFC --- clang/lib/Sema/SemaTemplate.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index a2b8cc14ca76..d3def13f495d 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -2711,7 +2711,6 @@ SmallVector TemplateParamsReferencedInTemplateArgumentList( : TemplateParams(TemplateParams.begin(), TemplateParams.end()) {} bool VisitTemplateTypeParmType(TemplateTypeParmType *TTP) { - TTP->getIndex(); MarkAppeared(TTP->getDecl()); return true; } -- cgit v1.2.3 From 550e09db1ad6e8f28546fa0c24b5582e57e210c4 Mon Sep 17 00:00:00 2001 From: Kadir Cetinkaya Date: Thu, 4 Apr 2024 09:56:15 +0200 Subject: [clangd][NFC] Delete dead code --- clang-tools-extra/clangd/IncludeCleaner.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/clang-tools-extra/clangd/IncludeCleaner.h b/clang-tools-extra/clangd/IncludeCleaner.h index 387763de3407..624e2116be7d 100644 --- a/clang-tools-extra/clangd/IncludeCleaner.h +++ b/clang-tools-extra/clangd/IncludeCleaner.h @@ -62,15 +62,6 @@ issueIncludeCleanerDiagnostics(ParsedAST &AST, llvm::StringRef Code, const ThreadsafeFS &TFS, HeaderFilter IgnoreHeader = {}); -/// Affects whether standard library includes should be considered for -/// removal. This is off by default for now due to implementation limitations: -/// - macros are not tracked -/// - symbol names without a unique associated header are not tracked -/// - references to std-namespaced C types are not properly tracked: -/// instead of std::size_t -> we see ::size_t -> -/// FIXME: remove this hack once the implementation is good enough. 
-void setIncludeCleanerAnalyzesStdlib(bool B); - /// Converts the clangd include representation to include-cleaner /// include representation. include_cleaner::Includes convertIncludes(const ParsedAST &); -- cgit v1.2.3 From 5e4a44380eed172e9f2954f462e94dac97b8e728 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Thu, 4 Apr 2024 17:05:47 +0900 Subject: [mlir][Interfaces][NFC] `ValueBoundsConstraintSet`: Pass stop condition in the constructor (#86099) This commit changes the API of `ValueBoundsConstraintSet`: the stop condition is now passed to the constructor instead of `processWorklist`. That makes it easier to add items to the worklist multiple times and process them in a consistent manner. The current `ValueBoundsConstraintSet` is passed as a reference to the stop function, so that the stop function can be defined before the the `ValueBoundsConstraintSet` is constructed. This change is in preparation of adding support for branches. --- .../Vector/IR/ScalableValueBoundsConstraintSet.h | 9 ++- .../mlir/Interfaces/ValueBoundsOpInterface.h | 16 +++-- .../Dialect/Affine/Transforms/ReifyValueBounds.cpp | 6 +- .../Dialect/Arith/Transforms/ReifyValueBounds.cpp | 6 +- .../lib/Dialect/Linalg/Transforms/HoistPadding.cpp | 2 +- .../Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp | 2 +- .../Vector/IR/ScalableValueBoundsConstraintSet.cpp | 21 ++++-- mlir/lib/Interfaces/ValueBoundsOpInterface.cpp | 82 ++++++++++++---------- .../lib/Dialect/Affine/TestReifyValueBounds.cpp | 9 ++- 9 files changed, 90 insertions(+), 63 deletions(-) diff --git a/mlir/include/mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h b/mlir/include/mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h index 31e19ff1ad39..67a6581eb2fb 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h +++ b/mlir/include/mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h @@ -29,9 +29,12 @@ struct ValueBoundsConstraintSet : protected ::mlir::ValueBoundsConstraintSet { 
struct ScalableValueBoundsConstraintSet : public llvm::RTTIExtends { - ScalableValueBoundsConstraintSet(MLIRContext *context, unsigned vscaleMin, - unsigned vscaleMax) - : RTTIExtends(context), vscaleMin(vscaleMin), vscaleMax(vscaleMax){}; + ScalableValueBoundsConstraintSet( + MLIRContext *context, + ValueBoundsConstraintSet::StopConditionFn stopCondition, + unsigned vscaleMin, unsigned vscaleMax) + : RTTIExtends(context, stopCondition), vscaleMin(vscaleMin), + vscaleMax(vscaleMax) {}; using RTTIExtends::bound; using RTTIExtends::StopConditionFn; diff --git a/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h b/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h index bdfd689c7ac4..83107a3f5f94 100644 --- a/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h +++ b/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h @@ -117,8 +117,9 @@ public: /// /// The first parameter of the function is the shaped value/index-typed /// value. The second parameter is the dimension in case of a shaped value. - using StopConditionFn = - function_ref /*dim*/)>; + /// The third parameter is this constraint set. + using StopConditionFn = std::function /*dim*/, ValueBoundsConstraintSet &cstr)>; /// Compute a bound for the given index-typed value or shape dimension size. /// The computed bound is stored in `resultMap`. The operands of the bound are @@ -271,22 +272,20 @@ protected: /// An index-typed value or the dimension of a shaped-type value. using ValueDim = std::pair; - ValueBoundsConstraintSet(MLIRContext *ctx); + ValueBoundsConstraintSet(MLIRContext *ctx, StopConditionFn stopCondition); /// Populates the constraint set for a value/map without actually computing /// the bound. Returns the position for the value/map (via the return value /// and `posOut` output parameter). 
int64_t populateConstraintsSet(Value value, - std::optional dim = std::nullopt, - StopConditionFn stopCondition = nullptr); + std::optional dim = std::nullopt); int64_t populateConstraintsSet(AffineMap map, ValueDimList mapOperands, - StopConditionFn stopCondition = nullptr, int64_t *posOut = nullptr); /// Iteratively process all elements on the worklist until an index-typed /// value or shaped value meets `stopCondition`. Such values are not processed /// any further. - void processWorklist(StopConditionFn stopCondition); + void processWorklist(); /// Bound the given column in the underlying constraint set by the given /// expression. @@ -333,6 +332,9 @@ protected: /// Builder for constructing affine expressions. Builder builder; + + /// The current stop condition function. + StopConditionFn stopCondition = nullptr; }; } // namespace mlir diff --git a/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp b/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp index 37b36f76d446..117ee8e8701a 100644 --- a/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp @@ -84,7 +84,8 @@ FailureOr mlir::affine::reifyShapedValueDimBound( OpBuilder &b, Location loc, presburger::BoundType type, Value value, int64_t dim, ValueBoundsConstraintSet::StopConditionFn stopCondition, bool closedUB) { - auto reifyToOperands = [&](Value v, std::optional d) { + auto reifyToOperands = [&](Value v, std::optional d, + ValueBoundsConstraintSet &cstr) { // We are trying to reify a bound for `value` in terms of the owning op's // operands. Construct a stop condition that evaluates to "true" for any SSA // value except for `value`. 
I.e., the bound will be computed in terms of @@ -100,7 +101,8 @@ FailureOr mlir::affine::reifyShapedValueDimBound( FailureOr mlir::affine::reifyIndexValueBound( OpBuilder &b, Location loc, presburger::BoundType type, Value value, ValueBoundsConstraintSet::StopConditionFn stopCondition, bool closedUB) { - auto reifyToOperands = [&](Value v, std::optional d) { + auto reifyToOperands = [&](Value v, std::optional d, + ValueBoundsConstraintSet &cstr) { return v != value; }; return reifyValueBound(b, loc, type, value, /*dim=*/std::nullopt, diff --git a/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp b/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp index 8d9fd1478aa9..fad221288f19 100644 --- a/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp @@ -119,7 +119,8 @@ FailureOr mlir::arith::reifyShapedValueDimBound( OpBuilder &b, Location loc, presburger::BoundType type, Value value, int64_t dim, ValueBoundsConstraintSet::StopConditionFn stopCondition, bool closedUB) { - auto reifyToOperands = [&](Value v, std::optional d) { + auto reifyToOperands = [&](Value v, std::optional d, + ValueBoundsConstraintSet &cstr) { // We are trying to reify a bound for `value` in terms of the owning op's // operands. Construct a stop condition that evaluates to "true" for any SSA // value expect for `value`. 
I.e., the bound will be computed in terms of @@ -135,7 +136,8 @@ FailureOr mlir::arith::reifyShapedValueDimBound( FailureOr mlir::arith::reifyIndexValueBound( OpBuilder &b, Location loc, presburger::BoundType type, Value value, ValueBoundsConstraintSet::StopConditionFn stopCondition, bool closedUB) { - auto reifyToOperands = [&](Value v, std::optional d) { + auto reifyToOperands = [&](Value v, std::optional d, + ValueBoundsConstraintSet &cstr) { return v != value; }; return reifyValueBound(b, loc, type, value, /*dim=*/std::nullopt, diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp index b32ea8eebaec..c3a08ce86082 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp @@ -468,7 +468,7 @@ HoistPaddingAnalysis::getHoistedPackedTensorSizes(RewriterBase &rewriter, FailureOr loopUb = affine::reifyIndexValueBound( rewriter, loc, presburger::BoundType::UB, forOp.getUpperBound(), /*stopCondition=*/ - [&](Value v, std::optional d) { + [&](Value v, std::optional d, ValueBoundsConstraintSet &cstr) { if (v == forOp.getUpperBound()) return false; // Compute a bound that is independent of any affine op results. diff --git a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp index cb36e0cecf0d..1e13e60068ee 100644 --- a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp @@ -58,7 +58,7 @@ struct ForOpInterface ValueDimList boundOperands; LogicalResult status = ValueBoundsConstraintSet::computeBound( bound, boundOperands, BoundType::EQ, yieldedValue, dim, - [&](Value v, std::optional d) { + [&](Value v, std::optional d, ValueBoundsConstraintSet &cstr) { // Stop when reaching a block argument of the loop body. 
if (auto bbArg = llvm::dyn_cast(v)) return bbArg.getOwner()->getParentOp() == forOp; diff --git a/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp b/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp index 6d7e3bc70f59..52359fa8a510 100644 --- a/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp +++ b/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp @@ -47,17 +47,26 @@ ScalableValueBoundsConstraintSet::computeScalableBound( unsigned vscaleMax, presburger::BoundType boundType, bool closedUB, StopConditionFn stopCondition) { using namespace presburger; - assert(vscaleMin <= vscaleMax); - ScalableValueBoundsConstraintSet scalableCstr(value.getContext(), vscaleMin, - vscaleMax); - int64_t pos = scalableCstr.populateConstraintsSet(value, dim, stopCondition); + // No stop condition specified: Keep adding constraints until the worklist + // is empty. + auto defaultStopCondition = [&](Value v, std::optional dim, + mlir::ValueBoundsConstraintSet &cstr) { + return false; + }; + + ScalableValueBoundsConstraintSet scalableCstr( + value.getContext(), stopCondition ? stopCondition : defaultStopCondition, + vscaleMin, vscaleMax); + int64_t pos = scalableCstr.populateConstraintsSet(value, dim); // Project out all variables apart from vscale. // This should result in constraints in terms of vscale only. 
- scalableCstr.projectOut( - [&](ValueDim p) { return p.first != scalableCstr.getVscaleValue(); }); + auto projectOutFn = [&](ValueDim p) { + return p.first != scalableCstr.getVscaleValue(); + }; + scalableCstr.projectOut(projectOutFn); assert(scalableCstr.cstr.getNumDimAndSymbolVars() == scalableCstr.positionToValueDim.size() && diff --git a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp index 9a3185d55d6e..0d362c7efa0a 100644 --- a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp +++ b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp @@ -67,8 +67,11 @@ static std::optional getConstantIntValue(OpFoldResult ofr) { return std::nullopt; } -ValueBoundsConstraintSet::ValueBoundsConstraintSet(MLIRContext *ctx) - : builder(ctx) {} +ValueBoundsConstraintSet::ValueBoundsConstraintSet( + MLIRContext *ctx, StopConditionFn stopCondition) + : builder(ctx), stopCondition(stopCondition) { + assert(stopCondition && "expected non-null stop condition"); +} char ValueBoundsConstraintSet::ID = 0; @@ -193,7 +196,8 @@ static Operation *getOwnerOfValue(Value value) { return value.getDefiningOp(); } -void ValueBoundsConstraintSet::processWorklist(StopConditionFn stopCondition) { +void ValueBoundsConstraintSet::processWorklist() { + LLVM_DEBUG(llvm::dbgs() << "Processing value bounds worklist...\n"); while (!worklist.empty()) { int64_t pos = worklist.front(); worklist.pop(); @@ -214,13 +218,19 @@ void ValueBoundsConstraintSet::processWorklist(StopConditionFn stopCondition) { // Do not process any further if the stop condition is met. auto maybeDim = dim == kIndexValue ? std::nullopt : std::make_optional(dim); - if (stopCondition(value, maybeDim)) + if (stopCondition(value, maybeDim, *this)) { + LLVM_DEBUG(llvm::dbgs() << "Stop condition met for: " << value + << " (dim: " << maybeDim << ")\n"); continue; + } // Query `ValueBoundsOpInterface` for constraints. New items may be added to // the worklist. 
auto valueBoundsOp = dyn_cast(getOwnerOfValue(value)); + LLVM_DEBUG(llvm::dbgs() + << "Query value bounds for: " << value + << " (owner: " << getOwnerOfValue(value)->getName() << ")\n"); if (valueBoundsOp) { if (dim == kIndexValue) { valueBoundsOp.populateBoundsForIndexValue(value, *this); @@ -229,6 +239,7 @@ void ValueBoundsConstraintSet::processWorklist(StopConditionFn stopCondition) { } continue; } + LLVM_DEBUG(llvm::dbgs() << "--> ValueBoundsOpInterface not implemented\n"); // If the op does not implement `ValueBoundsOpInterface`, check if it // implements the `DestinationStyleOpInterface`. OpResults of such ops are @@ -278,8 +289,6 @@ LogicalResult ValueBoundsConstraintSet::computeBound( bool closedUB) { #ifndef NDEBUG assertValidValueDim(value, dim); - assert(!stopCondition(value, dim) && - "stop condition should not be satisfied for starting point"); #endif // NDEBUG int64_t ubAdjustment = closedUB ? 0 : 1; @@ -289,9 +298,11 @@ LogicalResult ValueBoundsConstraintSet::computeBound( // Process the backward slice of `value` (i.e., reverse use-def chain) until // `stopCondition` is met. ValueDim valueDim = std::make_pair(value, dim.value_or(kIndexValue)); - ValueBoundsConstraintSet cstr(value.getContext()); + ValueBoundsConstraintSet cstr(value.getContext(), stopCondition); + assert(!stopCondition(value, dim, cstr) && + "stop condition should not be satisfied for starting point"); int64_t pos = cstr.insert(value, dim, /*isSymbol=*/false); - cstr.processWorklist(stopCondition); + cstr.processWorklist(); // Project out all variables (apart from `valueDim`) that do not match the // stop condition. @@ -301,7 +312,7 @@ LogicalResult ValueBoundsConstraintSet::computeBound( return false; auto maybeDim = p.second == kIndexValue ? std::nullopt : std::make_optional(p.second); - return !stopCondition(p.first, maybeDim); + return !stopCondition(p.first, maybeDim, cstr); }); // Compute lower and upper bounds for `valueDim`. 
@@ -407,7 +418,7 @@ LogicalResult ValueBoundsConstraintSet::computeDependentBound( bool closedUB) { return computeBound( resultMap, mapOperands, type, value, dim, - [&](Value v, std::optional d) { + [&](Value v, std::optional d, ValueBoundsConstraintSet &cstr) { return llvm::is_contained(dependencies, std::make_pair(v, d)); }, closedUB); @@ -443,7 +454,9 @@ LogicalResult ValueBoundsConstraintSet::computeIndependentBound( // Reify bounds in terms of any independent values. return computeBound( resultMap, mapOperands, type, value, dim, - [&](Value v, std::optional d) { return isIndependent(v); }, + [&](Value v, std::optional d, ValueBoundsConstraintSet &cstr) { + return isIndependent(v); + }, closedUB); } @@ -476,21 +489,19 @@ FailureOr ValueBoundsConstraintSet::computeConstantBound( presburger::BoundType type, AffineMap map, ValueDimList operands, StopConditionFn stopCondition, bool closedUB) { assert(map.getNumResults() == 1 && "expected affine map with one result"); - ValueBoundsConstraintSet cstr(map.getContext()); - int64_t pos = 0; - if (stopCondition) { - cstr.populateConstraintsSet(map, operands, stopCondition, &pos); - } else { - // No stop condition specified: Keep adding constraints until a bound could - // be computed. - cstr.populateConstraintsSet( - map, operands, - [&](Value v, std::optional dim) { - return cstr.cstr.getConstantBound64(type, pos).has_value(); - }, - &pos); - } + // Default stop condition if none was specified: Keep adding constraints until + // a bound could be computed. + int64_t pos; + auto defaultStopCondition = [&](Value v, std::optional dim, + ValueBoundsConstraintSet &cstr) { + return cstr.cstr.getConstantBound64(type, pos).has_value(); + }; + + ValueBoundsConstraintSet cstr( + map.getContext(), stopCondition ? stopCondition : defaultStopCondition); + cstr.populateConstraintsSet(map, operands, &pos); + // Compute constant bound for `valueDim`. int64_t ubAdjustment = closedUB ? 
0 : 1; if (auto bound = cstr.cstr.getConstantBound64(type, pos)) @@ -498,8 +509,9 @@ FailureOr ValueBoundsConstraintSet::computeConstantBound( return failure(); } -int64_t ValueBoundsConstraintSet::populateConstraintsSet( - Value value, std::optional dim, StopConditionFn stopCondition) { +int64_t +ValueBoundsConstraintSet::populateConstraintsSet(Value value, + std::optional dim) { #ifndef NDEBUG assertValidValueDim(value, dim); #endif // NDEBUG @@ -507,12 +519,12 @@ int64_t ValueBoundsConstraintSet::populateConstraintsSet( AffineMap map = AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0, Builder(value.getContext()).getAffineDimExpr(0)); - return populateConstraintsSet(map, {{value, dim}}, stopCondition); + return populateConstraintsSet(map, {{value, dim}}); } -int64_t ValueBoundsConstraintSet::populateConstraintsSet( - AffineMap map, ValueDimList operands, StopConditionFn stopCondition, - int64_t *posOut) { +int64_t ValueBoundsConstraintSet::populateConstraintsSet(AffineMap map, + ValueDimList operands, + int64_t *posOut) { assert(map.getNumResults() == 1 && "expected affine map with one result"); int64_t pos = insert(/*isSymbol=*/false); if (posOut) @@ -533,13 +545,7 @@ int64_t ValueBoundsConstraintSet::populateConstraintsSet( // Process the backward slice of `operands` (i.e., reverse use-def chain) // until `stopCondition` is met. - if (stopCondition) { - processWorklist(stopCondition); - } else { - // No stop condition specified: Keep adding constraints until the worklist - // is empty. 
- processWorklist([](Value v, std::optional dim) { return false; }); - } + processWorklist(); return pos; } diff --git a/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp b/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp index 5e160b720db6..4b2b1a06341b 100644 --- a/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp +++ b/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp @@ -117,14 +117,17 @@ static LogicalResult testReifyValueBounds(func::FuncOp funcOp, // Prepare stop condition. By default, reify in terms of the op's // operands. No stop condition is used when a constant was requested. - std::function)> stopCondition = - [&](Value v, std::optional d) { + std::function, + ValueBoundsConstraintSet & cstr)> + stopCondition = [&](Value v, std::optional d, + ValueBoundsConstraintSet &cstr) { // Reify in terms of SSA values that are different from `value`. return v != value; }; if (reifyToFuncArgs) { // Reify in terms of function block arguments. - stopCondition = stopCondition = [](Value v, std::optional d) { + stopCondition = [](Value v, std::optional d, + ValueBoundsConstraintSet &cstr) { auto bbArg = dyn_cast(v); if (!bbArg) return false; -- cgit v1.2.3 From a2306b65d223212dcfafe12c7299262d8d4fdcb4 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Thu, 4 Apr 2024 10:27:08 +0200 Subject: [libc] Refactor `BigInt` (#86137) This patch moves most of the multiprecision logic to the `multiword` namespace and simplifies some logic in `BigInt`. It also fully implements the mask and count functions and increases test coverage. `math_extras.h` is also reworked to make it more concise. 
--- libc/fuzzing/CMakeLists.txt | 1 + libc/fuzzing/__support/CMakeLists.txt | 7 + libc/fuzzing/__support/uint_fuzz.cpp | 70 ++ libc/src/__support/FPUtil/dyadic_float.h | 6 +- libc/src/__support/UInt.h | 1126 ++++++++++---------- libc/src/__support/float_to_string.h | 7 +- libc/src/__support/integer_literals.h | 25 +- libc/src/__support/math_extras.h | 249 ++--- libc/src/__support/number_pair.h | 11 - libc/test/src/__support/integer_literals_test.cpp | 21 + libc/test/src/__support/math_extras_test.cpp | 57 + libc/test/src/__support/uint_test.cpp | 192 +++- .../libc/test/src/__support/BUILD.bazel | 1 + 13 files changed, 1011 insertions(+), 762 deletions(-) create mode 100644 libc/fuzzing/__support/CMakeLists.txt create mode 100644 libc/fuzzing/__support/uint_fuzz.cpp diff --git a/libc/fuzzing/CMakeLists.txt b/libc/fuzzing/CMakeLists.txt index 82487688af11..816691b4bd44 100644 --- a/libc/fuzzing/CMakeLists.txt +++ b/libc/fuzzing/CMakeLists.txt @@ -1,6 +1,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=fuzzer") add_custom_target(libc-fuzzer) +add_subdirectory(__support) # TODO(#85680): Re-enable math fuzzing after headers are sorted out # add_subdirectory(math) add_subdirectory(stdlib) diff --git a/libc/fuzzing/__support/CMakeLists.txt b/libc/fuzzing/__support/CMakeLists.txt new file mode 100644 index 000000000000..278e914e3fbe --- /dev/null +++ b/libc/fuzzing/__support/CMakeLists.txt @@ -0,0 +1,7 @@ +add_libc_fuzzer( + uint_fuzz + SRCS + uint_fuzz.cpp + DEPENDS + libc.src.__support.uint +) diff --git a/libc/fuzzing/__support/uint_fuzz.cpp b/libc/fuzzing/__support/uint_fuzz.cpp new file mode 100644 index 000000000000..f48f00d3b4ba --- /dev/null +++ b/libc/fuzzing/__support/uint_fuzz.cpp @@ -0,0 +1,70 @@ +#include "src/__support/CPP/bit.h" +#include "src/__support/UInt.h" +#include "src/string/memory_utils/inline_memcpy.h" + +using namespace LIBC_NAMESPACE; + +// Helper function when using gdb / lldb to set a breakpoint and inspect values. 
+template void debug_and_trap(const char *msg, T a, T b) { + __builtin_trap(); +} + +#define DEBUG_AND_TRAP() + +#define TEST_BINOP(OP) \ + if ((a OP b) != (static_cast(BigInt(a) OP BigInt(b)))) \ + debug_and_trap(#OP, a, b); + +#define TEST_SHIFTOP(OP) \ + if ((a OP b) != (static_cast(BigInt(a) OP b))) \ + debug_and_trap(#OP, a, b); + +#define TEST_FUNCTION(FUN) \ + if (FUN(a) != FUN(BigInt(a))) \ + debug_and_trap(#FUN, a, b); + +// Test that basic arithmetic operations of BigInt behave like their scalar +// counterparts. +template void run_tests(T a, T b) { + TEST_BINOP(+) + TEST_BINOP(-) + TEST_BINOP(*) + if (b != 0) + TEST_BINOP(/) + if (b >= 0 && b < cpp::numeric_limits::digits) { + TEST_SHIFTOP(<<) + TEST_SHIFTOP(>>) + } + if constexpr (!BigInt::SIGNED) { + TEST_FUNCTION(cpp::has_single_bit) + TEST_FUNCTION(cpp::countr_zero) + TEST_FUNCTION(cpp::countl_zero) + TEST_FUNCTION(cpp::countl_one) + TEST_FUNCTION(cpp::countr_one) + } +} + +// Reads a T from libfuzzer data. +template T read(const uint8_t *data, size_t &remainder) { + T out = 0; + constexpr size_t T_SIZE = sizeof(T); + const size_t copy_size = remainder < T_SIZE ? remainder : T_SIZE; + inline_memcpy(&out, data, copy_size); + remainder -= copy_size; + return out; +} + +template +void run_tests(const uint8_t *data, size_t size) { + const auto a = read(data, size); + const auto b = read(data, size); + run_tests(a, b); +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + // unsigned + run_tests>(data, size); + // signed + run_tests>(data, size); + return 0; +} diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h index 73fd7381c3c8..e0c205f52383 100644 --- a/libc/src/__support/FPUtil/dyadic_float.h +++ b/libc/src/__support/FPUtil/dyadic_float.h @@ -58,9 +58,9 @@ template struct DyadicFloat { // significant bit. 
LIBC_INLINE constexpr DyadicFloat &normalize() { if (!mantissa.is_zero()) { - int shift_length = static_cast(mantissa.clz()); + int shift_length = cpp::countl_zero(mantissa); exponent -= shift_length; - mantissa.shift_left(static_cast(shift_length)); + mantissa <<= static_cast(shift_length); } return *this; } @@ -233,7 +233,7 @@ LIBC_INLINE constexpr DyadicFloat quick_add(DyadicFloat a, result.sign = a.sign; result.exponent = a.exponent; result.mantissa = a.mantissa; - if (result.mantissa.add(b.mantissa)) { + if (result.mantissa.add_overflow(b.mantissa)) { // Mantissa addition overflow. result.shift_right(1); result.mantissa.val[DyadicFloat::MantissaType::WORD_COUNT - 1] |= diff --git a/libc/src/__support/UInt.h b/libc/src/__support/UInt.h index 282efdba1c5f..c524de38d986 100644 --- a/libc/src/__support/UInt.h +++ b/libc/src/__support/UInt.h @@ -14,10 +14,11 @@ #include "src/__support/CPP/limits.h" #include "src/__support/CPP/optional.h" #include "src/__support/CPP/type_traits.h" -#include "src/__support/macros/attributes.h" // LIBC_INLINE -#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/macros/attributes.h" // LIBC_INLINE +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/macros/properties/compiler.h" // LIBC_COMPILER_IS_CLANG #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128, LIBC_TYPES_HAS_INT64 -#include "src/__support/math_extras.h" // SumCarry, DiffBorrow +#include "src/__support/math_extras.h" // add_with_carry, sub_with_borrow #include "src/__support/number_pair.h" #include // For size_t @@ -25,71 +26,321 @@ namespace LIBC_NAMESPACE { -namespace internal { -template struct half_width; +namespace multiword { -template <> struct half_width : cpp::type_identity {}; -template <> struct half_width : cpp::type_identity {}; +// A type trait mapping unsigned integers to their half-width unsigned +// counterparts. 
+template struct half_width; template <> struct half_width : cpp::type_identity {}; +template <> struct half_width : cpp::type_identity {}; +#ifdef LIBC_TYPES_HAS_INT64 +template <> struct half_width : cpp::type_identity {}; #ifdef LIBC_TYPES_HAS_INT128 template <> struct half_width<__uint128_t> : cpp::type_identity {}; #endif // LIBC_TYPES_HAS_INT128 - +#endif // LIBC_TYPES_HAS_INT64 template using half_width_t = typename half_width::type; -template constexpr NumberPair full_mul(T a, T b) { - NumberPair pa = split(a); - NumberPair pb = split(b); - NumberPair prod; +// An array of two elements that can be used in multiword operations. +template struct DoubleWide final : cpp::array { + using UP = cpp::array; + using UP::UP; + LIBC_INLINE constexpr DoubleWide(T lo, T hi) : UP({lo, hi}) {} +}; + +// Converts an unsigned value into a DoubleWide>. +template LIBC_INLINE constexpr auto split(T value) { + static_assert(cpp::is_unsigned_v); + return cpp::bit_cast>>(value); +} + +// The low part of a DoubleWide value. +template LIBC_INLINE constexpr T lo(const DoubleWide &value) { + return value[0]; +} +// The high part of a DoubleWide value. +template LIBC_INLINE constexpr T hi(const DoubleWide &value) { + return value[1]; +} +// The low part of an unsigned value. +template LIBC_INLINE constexpr half_width_t lo(T value) { + return lo(split(value)); +} +// The high part of an unsigned value. +template LIBC_INLINE constexpr half_width_t hi(T value) { + return hi(split(value)); +} + +// Returns 'a' times 'b' in a DoubleWide. Cannot overflow by construction. 
+template +LIBC_INLINE constexpr DoubleWide mul2(word a, word b) { + if constexpr (cpp::is_same_v) { + return split(uint16_t(a) * uint16_t(b)); + } else if constexpr (cpp::is_same_v) { + return split(uint32_t(a) * uint32_t(b)); + } +#ifdef LIBC_TYPES_HAS_INT64 + else if constexpr (cpp::is_same_v) { + return split(uint64_t(a) * uint64_t(b)); + } +#endif +#ifdef LIBC_TYPES_HAS_INT128 + else if constexpr (cpp::is_same_v) { + return split<__uint128_t>(__uint128_t(a) * __uint128_t(b)); + } +#endif + else { + using half_word = half_width_t; + const auto shiftl = [](word value) -> word { + return value << cpp::numeric_limits::digits; + }; + const auto shiftr = [](word value) -> word { + return value >> cpp::numeric_limits::digits; + }; + // Here we do a one digit multiplication where 'a' and 'b' are of type + // word. We split 'a' and 'b' into half words and perform the classic long + // multiplication with 'a' and 'b' being two-digit numbers. + + // a a_hi a_lo + // x b => x b_hi b_lo + // ---- ----------- + // c result + // We convert 'lo' and 'hi' from 'half_word' to 'word' so multiplication + // doesn't overflow. + const word a_lo = lo(a); + const word b_lo = lo(b); + const word a_hi = hi(a); + const word b_hi = hi(b); + const word step1 = b_lo * a_lo; // no overflow; + const word step2 = b_lo * a_hi; // no overflow; + const word step3 = b_hi * a_lo; // no overflow; + const word step4 = b_hi * a_hi; // no overflow; + word lo_digit = step1; + word hi_digit = step4; + const word no_carry = 0; + word carry; + word _; // unused carry variable. + lo_digit = add_with_carry(lo_digit, shiftl(step2), no_carry, carry); + hi_digit = add_with_carry(hi_digit, shiftr(step2), carry, _); + lo_digit = add_with_carry(lo_digit, shiftl(step3), no_carry, carry); + hi_digit = add_with_carry(hi_digit, shiftr(step3), carry, _); + return DoubleWide(lo_digit, hi_digit); + } +} + +// In-place 'dst op= rhs' with operation with carry propagation. Returns carry. 
+template +LIBC_INLINE constexpr word inplace_binop(Function op_with_carry, + cpp::array &dst, + const cpp::array &rhs) { + static_assert(N >= M); + word carry_out = 0; + for (size_t i = 0; i < N; ++i) { + const bool has_rhs_value = i < M; + const word rhs_value = has_rhs_value ? rhs[i] : 0; + const word carry_in = carry_out; + dst[i] = op_with_carry(dst[i], rhs_value, carry_in, carry_out); + // stop early when rhs is over and no carry is to be propagated. + if (!has_rhs_value && carry_out == 0) + break; + } + return carry_out; +} - prod.lo = pa.lo * pb.lo; // exact - prod.hi = pa.hi * pb.hi; // exact - NumberPair lo_hi = split(pa.lo * pb.hi); // exact - NumberPair hi_lo = split(pa.hi * pb.lo); // exact +// In-place addition. Returns carry. +template +LIBC_INLINE constexpr word add_with_carry(cpp::array &dst, + const cpp::array &rhs) { + return inplace_binop(LIBC_NAMESPACE::add_with_carry, dst, rhs); +} + +// In-place subtraction. Returns borrow. +template +LIBC_INLINE constexpr word sub_with_borrow(cpp::array &dst, + const cpp::array &rhs) { + return inplace_binop(LIBC_NAMESPACE::sub_with_borrow, dst, rhs); +} + +// In-place multiply-add. Returns carry. +// i.e., 'dst += b * c' +template +LIBC_INLINE constexpr word mul_add_with_carry(cpp::array &dst, word b, + word c) { + return add_with_carry(dst, mul2(b, c)); +} - constexpr size_t HALF_BIT_WIDTH = sizeof(T) * CHAR_BIT / 2; +// An array of two elements serving as an accumulator during multiword +// computations. 
+template struct Accumulator final : cpp::array { + using UP = cpp::array; + LIBC_INLINE constexpr Accumulator() : UP({0, 0}) {} + LIBC_INLINE constexpr T advance(T carry_in) { + auto result = UP::front(); + UP::front() = UP::back(); + UP::back() = carry_in; + return result; + } + LIBC_INLINE constexpr T sum() const { return UP::front(); } + LIBC_INLINE constexpr T carry() const { return UP::back(); } +}; - auto r1 = add_with_carry(prod.lo, lo_hi.lo << HALF_BIT_WIDTH, T(0)); - prod.lo = r1.sum; - prod.hi = add_with_carry(prod.hi, lo_hi.hi, r1.carry).sum; +// In-place multiplication by a single word. Returns carry. +template +LIBC_INLINE constexpr word scalar_multiply_with_carry(cpp::array &dst, + word x) { + Accumulator acc; + for (auto &val : dst) { + const word carry = mul_add_with_carry(acc, val, x); + val = acc.advance(carry); + } + return acc.carry(); +} - auto r2 = add_with_carry(prod.lo, hi_lo.lo << HALF_BIT_WIDTH, T(0)); - prod.lo = r2.sum; - prod.hi = add_with_carry(prod.hi, hi_lo.hi, r2.carry).sum; +// Multiplication of 'lhs' by 'rhs' into 'dst'. Returns carry. +// This function is safe to use for signed numbers. +// https://stackoverflow.com/a/20793834 +// https://pages.cs.wisc.edu/%7Emarkhill/cs354/Fall2008/beyond354/int.mult.html +template +LIBC_INLINE constexpr word multiply_with_carry(cpp::array &dst, + const cpp::array &lhs, + const cpp::array &rhs) { + static_assert(O >= M + N); + Accumulator acc; + for (size_t i = 0; i < O; ++i) { + const size_t lower_idx = i < N ? 0 : i - N + 1; + const size_t upper_idx = i < M ? i : M - 1; + word carry = 0; + for (size_t j = lower_idx; j <= upper_idx; ++j) + carry += mul_add_with_carry(acc, lhs[j], rhs[i - j]); + dst[i] = acc.advance(carry); + } + return acc.carry(); +} - return prod; +template +LIBC_INLINE constexpr void quick_mul_hi(cpp::array &dst, + const cpp::array &lhs, + const cpp::array &rhs) { + Accumulator acc; + word carry = 0; + // First round of accumulation for those at N - 1 in the full product. 
+ for (size_t i = 0; i < N; ++i) + carry += mul_add_with_carry(acc, lhs[i], rhs[N - 1 - i]); + for (size_t i = N; i < 2 * N - 1; ++i) { + acc.advance(carry); + carry = 0; + for (size_t j = i - N + 1; j < N; ++j) + carry += mul_add_with_carry(acc, lhs[j], rhs[i - j]); + dst[i - N] = acc.sum(); + } + dst.back() = acc.carry(); } -template <> -LIBC_INLINE constexpr NumberPair full_mul(uint32_t a, - uint32_t b) { - uint64_t prod = uint64_t(a) * uint64_t(b); - NumberPair result; - result.lo = uint32_t(prod); - result.hi = uint32_t(prod >> 32); - return result; +template +LIBC_INLINE constexpr bool is_negative(cpp::array &array) { + using signed_word = cpp::make_signed_t; + return cpp::bit_cast(array.back()) < 0; } +// An enum for the shift function below. +enum Direction { LEFT, RIGHT }; + +// A bitwise shift on an array of elements. +// TODO: Make the result UB when 'offset' is greater or equal to the number of +// bits in 'array'. This will allow for better code performance. +template +LIBC_INLINE constexpr cpp::array shift(cpp::array array, + size_t offset) { + static_assert(direction == LEFT || direction == RIGHT); + constexpr size_t WORD_BITS = cpp::numeric_limits::digits; + constexpr size_t TOTAL_BITS = N * WORD_BITS; + if (LIBC_UNLIKELY(offset == 0)) + return array; + if (LIBC_UNLIKELY(offset >= TOTAL_BITS)) + return {}; #ifdef LIBC_TYPES_HAS_INT128 -template <> -LIBC_INLINE constexpr NumberPair full_mul(uint64_t a, - uint64_t b) { - __uint128_t prod = __uint128_t(a) * __uint128_t(b); - NumberPair result; - result.lo = uint64_t(prod); - result.hi = uint64_t(prod >> 64); - return result; + if constexpr (TOTAL_BITS == 128) { + using type = cpp::conditional_t; + auto tmp = cpp::bit_cast(array); + if constexpr (direction == LEFT) + tmp <<= offset; + else + tmp >>= offset; + return cpp::bit_cast>(tmp); + } +#endif + const bool is_neg = is_signed && is_negative(array); + constexpr auto at = [](size_t index) -> int { + // reverse iteration when direction == LEFT. 
+ if constexpr (direction == LEFT) + return int(N) - int(index) - 1; + return int(index); + }; + const auto safe_get_at = [&](size_t index) -> word { + // return appropriate value when accessing out of bound elements. + const int i = at(index); + if (i < 0) + return 0; + if (i >= int(N)) + return is_neg ? -1 : 0; + return array[i]; + }; + const size_t index_offset = offset / WORD_BITS; + const size_t bit_offset = offset % WORD_BITS; +#ifdef LIBC_COMPILER_IS_CLANG + __builtin_assume(index_offset < N); +#endif + cpp::array out = {}; + for (size_t index = 0; index < N; ++index) { + const word part1 = safe_get_at(index + index_offset); + const word part2 = safe_get_at(index + index_offset + 1); + word &dst = out[at(index)]; + if (bit_offset == 0) + dst = part1; // no crosstalk between parts. + else if constexpr (direction == LEFT) + dst = (part1 << bit_offset) | (part2 >> (WORD_BITS - bit_offset)); + else + dst = (part1 >> bit_offset) | (part2 << (WORD_BITS - bit_offset)); + } + return out; } -#endif // LIBC_TYPES_HAS_INT128 -} // namespace internal +#define DECLARE_COUNTBIT(NAME, INDEX_EXPR) \ + template \ + LIBC_INLINE constexpr int NAME(const cpp::array &val) { \ + int bit_count = 0; \ + for (size_t i = 0; i < N; ++i) { \ + const int word_count = cpp::NAME(val[INDEX_EXPR]); \ + bit_count += word_count; \ + if (word_count != cpp::numeric_limits::digits) \ + break; \ + } \ + return bit_count; \ + } + +DECLARE_COUNTBIT(countr_zero, i) // iterating forward +DECLARE_COUNTBIT(countr_one, i) // iterating forward +DECLARE_COUNTBIT(countl_zero, N - i - 1) // iterating backward +DECLARE_COUNTBIT(countl_one, N - i - 1) // iterating backward + +} // namespace multiword template struct BigInt { +private: static_assert(cpp::is_integral_v && cpp::is_unsigned_v, "WordType must be unsigned integer."); + struct Division { + BigInt quotient; + BigInt remainder; + }; + +public: using word_type = WordType; + using unsigned_type = BigInt; + using signed_type = BigInt; + LIBC_INLINE_VAR 
static constexpr bool SIGNED = Signed; LIBC_INLINE_VAR static constexpr size_t BITS = Bits; LIBC_INLINE_VAR @@ -100,10 +351,7 @@ struct BigInt { LIBC_INLINE_VAR static constexpr size_t WORD_COUNT = Bits / WORD_SIZE; - using unsigned_type = BigInt; - using signed_type = BigInt; - - cpp::array val{}; + cpp::array val{}; // zero initialized. LIBC_INLINE constexpr BigInt() = default; @@ -112,76 +360,67 @@ struct BigInt { template LIBC_INLINE constexpr BigInt( const BigInt &other) { - if (OtherBits >= Bits) { + if (OtherBits >= Bits) { // truncate for (size_t i = 0; i < WORD_COUNT; ++i) val[i] = other[i]; - } else { + } else { // zero or sign extend size_t i = 0; for (; i < OtherBits / WORD_SIZE; ++i) val[i] = other[i]; - WordType sign = 0; - if constexpr (Signed && OtherSigned) { - sign = static_cast( - -static_cast>(other.is_neg())); - } - for (; i < WORD_COUNT; ++i) - val[i] = sign; + extend(i, Signed && other.is_neg()); } } // Construct a BigInt from a C array. - template = 0> - LIBC_INLINE constexpr BigInt(const WordType (&nums)[N]) { - size_t min_wordcount = N < WORD_COUNT ? N : WORD_COUNT; - size_t i = 0; - for (; i < min_wordcount; ++i) + template LIBC_INLINE constexpr BigInt(const WordType (&nums)[N]) { + static_assert(N == WORD_COUNT); + for (size_t i = 0; i < WORD_COUNT; ++i) val[i] = nums[i]; + } - // If nums doesn't completely fill val, then fill the rest with zeroes. - for (; i < WORD_COUNT; ++i) - val[i] = 0; + LIBC_INLINE constexpr explicit BigInt( + const cpp::array &words) { + val = words; } // Initialize the first word to |v| and the rest to 0. 
template >> LIBC_INLINE constexpr BigInt(T v) { - val[0] = static_cast(v); - - if constexpr (WORD_COUNT == 1) - return; - - if constexpr (Bits < sizeof(T) * CHAR_BIT) { - for (int i = 1; i < WORD_COUNT; ++i) { - v >>= WORD_SIZE; - val[i] = static_cast(v); + constexpr size_t T_SIZE = sizeof(T) * CHAR_BIT; + const bool is_neg = Signed && (v < 0); + for (size_t i = 0; i < WORD_COUNT; ++i) { + if (v == 0) { + extend(i, is_neg); + return; } - return; - } - - size_t i = 1; - - if constexpr (WORD_SIZE < sizeof(T) * CHAR_BIT) - for (; i < sizeof(T) * CHAR_BIT / WORD_SIZE; ++i) { + val[i] = static_cast(v); + if constexpr (T_SIZE > WORD_SIZE) v >>= WORD_SIZE; - val[i] = static_cast(v); - } - - WordType sign = (Signed && (v < 0)) ? ~WordType(0) : WordType(0); - for (; i < WORD_COUNT; ++i) { - val[i] = sign; + else + v = 0; } } + LIBC_INLINE constexpr BigInt &operator=(const BigInt &other) = default; - LIBC_INLINE constexpr explicit BigInt( - const cpp::array &words) { - for (size_t i = 0; i < WORD_COUNT; ++i) - val[i] = words[i]; + // constants + LIBC_INLINE static constexpr BigInt zero() { return BigInt(); } + LIBC_INLINE static constexpr BigInt one() { return BigInt(1); } + LIBC_INLINE static constexpr BigInt all_ones() { return ~zero(); } + LIBC_INLINE static constexpr BigInt min() { + BigInt out; + if constexpr (SIGNED) + out.set_msb(); + return out; + } + LIBC_INLINE static constexpr BigInt max() { + BigInt out = all_ones(); + if constexpr (SIGNED) + out.clear_msb(); + return out; } // TODO: Reuse the Sign type. 
- LIBC_INLINE constexpr bool is_neg() const { - return val.back() >> (WORD_SIZE - 1); - } + LIBC_INLINE constexpr bool is_neg() const { return SIGNED && get_msb(); } template LIBC_INLINE constexpr explicit operator T() const { return to(); @@ -191,200 +430,100 @@ struct BigInt { LIBC_INLINE constexpr cpp::enable_if_t< cpp::is_integral_v && !cpp::is_same_v, T> to() const { + constexpr size_t T_SIZE = sizeof(T) * CHAR_BIT; T lo = static_cast(val[0]); - - constexpr size_t T_BITS = sizeof(T) * CHAR_BIT; - - if constexpr (T_BITS <= WORD_SIZE) + if constexpr (T_SIZE <= WORD_SIZE) return lo; - constexpr size_t MAX_COUNT = - T_BITS > Bits ? WORD_COUNT : T_BITS / WORD_SIZE; + T_SIZE > Bits ? WORD_COUNT : T_SIZE / WORD_SIZE; for (size_t i = 1; i < MAX_COUNT; ++i) lo += static_cast(val[i]) << (WORD_SIZE * i); - - if constexpr (Signed && (T_BITS > Bits)) { + if constexpr (Signed && (T_SIZE > Bits)) { // Extend sign for negative numbers. constexpr T MASK = (~T(0) << Bits); if (is_neg()) lo |= MASK; } - return lo; } LIBC_INLINE constexpr explicit operator bool() const { return !is_zero(); } - LIBC_INLINE constexpr BigInt &operator=(const BigInt &other) = default; - LIBC_INLINE constexpr bool is_zero() const { - for (size_t i = 0; i < WORD_COUNT; ++i) { - if (val[i] != 0) + for (auto part : val) + if (part != 0) return false; - } return true; } - // Add x to this number and store the result in this number. + // Add 'rhs' to this number and store the result in this number. // Returns the carry value produced by the addition operation. 
- LIBC_INLINE constexpr WordType add(const BigInt &x) { - SumCarry s{0, 0}; - for (size_t i = 0; i < WORD_COUNT; ++i) { - s = add_with_carry(val[i], x.val[i], s.carry); - val[i] = s.sum; - } - return s.carry; + LIBC_INLINE constexpr WordType add_overflow(const BigInt &rhs) { + return multiword::add_with_carry(val, rhs.val); } LIBC_INLINE constexpr BigInt operator+(const BigInt &other) const { - BigInt result; - SumCarry s{0, 0}; - for (size_t i = 0; i < WORD_COUNT; ++i) { - s = add_with_carry(val[i], other.val[i], s.carry); - result.val[i] = s.sum; - } + BigInt result = *this; + result.add_overflow(other); return result; } // This will only apply when initializing a variable from constant values, so // it will always use the constexpr version of add_with_carry. LIBC_INLINE constexpr BigInt operator+(BigInt &&other) const { - BigInt result; - SumCarry s{0, 0}; - for (size_t i = 0; i < WORD_COUNT; ++i) { - s = add_with_carry(val[i], other.val[i], s.carry); - result.val[i] = s.sum; - } - return result; + // We use addition commutativity to reuse 'other' and prevent allocation. + other.add_overflow(*this); // Returned carry value is ignored. + return other; } LIBC_INLINE constexpr BigInt &operator+=(const BigInt &other) { - add(other); // Returned carry value is ignored. + add_overflow(other); // Returned carry value is ignored. return *this; } - // Subtract x to this number and store the result in this number. + // Subtract 'rhs' to this number and store the result in this number. // Returns the carry value produced by the subtraction operation. 
- LIBC_INLINE constexpr WordType sub(const BigInt &x) { - DiffBorrow d{0, 0}; - for (size_t i = 0; i < WORD_COUNT; ++i) { - d = sub_with_borrow(val[i], x.val[i], d.borrow); - val[i] = d.diff; - } - return d.borrow; + LIBC_INLINE constexpr WordType sub_overflow(const BigInt &rhs) { + return multiword::sub_with_borrow(val, rhs.val); } LIBC_INLINE constexpr BigInt operator-(const BigInt &other) const { - BigInt result; - DiffBorrow d{0, 0}; - for (size_t i = 0; i < WORD_COUNT; ++i) { - d = sub_with_borrow(val[i], other.val[i], d.borrow); - result.val[i] = d.diff; - } + BigInt result = *this; + result.sub_overflow(other); // Returned carry value is ignored. return result; } LIBC_INLINE constexpr BigInt operator-(BigInt &&other) const { - BigInt result; - DiffBorrow d{0, 0}; - for (size_t i = 0; i < WORD_COUNT; ++i) { - d = sub_with_borrow(val[i], other.val[i], d.borrow); - result.val[i] = d.diff; - } + BigInt result = *this; + result.sub_overflow(other); // Returned carry value is ignored. return result; } LIBC_INLINE constexpr BigInt &operator-=(const BigInt &other) { // TODO(lntue): Set overflow flag / errno when carry is true. - sub(other); + sub_overflow(other); // Returned carry value is ignored. return *this; } - // Multiply this number with x and store the result in this number. It is - // implemented using the long multiplication algorithm by splitting the - // 64-bit words of this number and |x| in to 32-bit halves but peforming - // the operations using 64-bit numbers. This ensures that we don't lose the - // carry bits. - // Returns the carry value produced by the multiplication operation. + // Multiply this number with x and store the result in this number. 
LIBC_INLINE constexpr WordType mul(WordType x) { - BigInt<2 * WORD_SIZE, Signed, WordType> partial_sum(0); - for (size_t i = 0; i < WORD_COUNT; ++i) { - NumberPair prod = internal::full_mul(val[i], x); - BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi}); - const WordType carry = partial_sum.add(tmp); - val[i] = partial_sum.val[0]; - partial_sum.val[0] = partial_sum.val[1]; - partial_sum.val[1] = carry; - } - return partial_sum.val[1]; - } - - LIBC_INLINE constexpr BigInt operator*(const BigInt &other) const { - if constexpr (Signed) { - BigInt a(*this); - BigInt b(other); - const bool a_neg = a.is_neg(); - const bool b_neg = b.is_neg(); - if (a_neg) - a = -a; - if (b_neg) - b = -b; - BigInt prod = a * b; - if (a_neg != b_neg) - prod = -prod; - return static_cast>(prod); - } else { - if constexpr (WORD_COUNT == 1) { - return {val[0] * other.val[0]}; - } else { - BigInt result(0); - BigInt<2 * WORD_SIZE, Signed, WordType> partial_sum(0); - WordType carry = 0; - for (size_t i = 0; i < WORD_COUNT; ++i) { - for (size_t j = 0; j <= i; j++) { - NumberPair prod = - internal::full_mul(val[j], other.val[i - j]); - BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi}); - carry += partial_sum.add(tmp); - } - result.val[i] = partial_sum.val[0]; - partial_sum.val[0] = partial_sum.val[1]; - partial_sum.val[1] = carry; - carry = 0; - } - return result; - } - } + return multiword::scalar_multiply_with_carry(val, x); } - // Return the full product, only unsigned for now. + // Return the full product. template - LIBC_INLINE constexpr BigInt + LIBC_INLINE constexpr auto ful_mul(const BigInt &other) const { - BigInt result(0); - BigInt<2 * WORD_SIZE, Signed, WordType> partial_sum(0); - WordType carry = 0; - constexpr size_t OTHER_WORDCOUNT = - BigInt::WORD_COUNT; - for (size_t i = 0; i <= WORD_COUNT + OTHER_WORDCOUNT - 2; ++i) { - const size_t lower_idx = - i < OTHER_WORDCOUNT ? 0 : i - OTHER_WORDCOUNT + 1; - const size_t upper_idx = i < WORD_COUNT ? 
i : WORD_COUNT - 1; - for (size_t j = lower_idx; j <= upper_idx; ++j) { - NumberPair prod = - internal::full_mul(val[j], other.val[i - j]); - BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi}); - carry += partial_sum.add(tmp); - } - result.val[i] = partial_sum.val[0]; - partial_sum.val[0] = partial_sum.val[1]; - partial_sum.val[1] = carry; - carry = 0; - } - result.val[WORD_COUNT + OTHER_WORDCOUNT - 1] = partial_sum.val[0]; + BigInt result; + multiword::multiply_with_carry(result.val, val, other.val); return result; } + LIBC_INLINE constexpr BigInt operator*(const BigInt &other) const { + // Perform full mul and truncate. + return BigInt(ful_mul(other)); + } + // Fast hi part of the full product. The normal product `operator*` returns // `Bits` least significant bits of the full product, while this function will // approximate `Bits` most significant bits of the full product with errors @@ -407,39 +546,17 @@ struct BigInt { // 256 4 16 10 3 // 512 8 64 36 7 LIBC_INLINE constexpr BigInt quick_mul_hi(const BigInt &other) const { - BigInt result(0); - BigInt<2 * WORD_SIZE, Signed, WordType> partial_sum(0); - WordType carry = 0; - // First round of accumulation for those at WORD_COUNT - 1 in the full - // product. 
- for (size_t i = 0; i < WORD_COUNT; ++i) { - NumberPair prod = - internal::full_mul(val[i], other.val[WORD_COUNT - 1 - i]); - BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi}); - carry += partial_sum.add(tmp); - } - for (size_t i = WORD_COUNT; i < 2 * WORD_COUNT - 1; ++i) { - partial_sum.val[0] = partial_sum.val[1]; - partial_sum.val[1] = carry; - carry = 0; - for (size_t j = i - WORD_COUNT + 1; j < WORD_COUNT; ++j) { - NumberPair prod = - internal::full_mul(val[j], other.val[i - j]); - BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi}); - carry += partial_sum.add(tmp); - } - result.val[i - WORD_COUNT] = partial_sum.val[0]; - } - result.val[WORD_COUNT - 1] = partial_sum.val[1]; + BigInt result; + multiword::quick_mul_hi(result.val, val, other.val); return result; } - // pow takes a power and sets this to its starting value to that power. Zero - // to the zeroth power returns 1. + // BigInt(x).pow_n(n) computes x ^ n. + // Note 0 ^ 0 == 1. LIBC_INLINE constexpr void pow_n(uint64_t power) { - BigInt result = 1; + static_assert(!Signed); + BigInt result = one(); BigInt cur_power = *this; - while (power > 0) { if ((power % 2) > 0) result *= cur_power; @@ -449,38 +566,23 @@ struct BigInt { *this = result; } - // TODO: Make division work correctly for signed integers. - - // div takes another BigInt of the same size and divides this by it. The value - // of this will be set to the quotient, and the return value is the remainder. - LIBC_INLINE constexpr cpp::optional div(const BigInt &other) { - BigInt remainder(0); - if (*this < other) { - remainder = *this; - *this = BigInt(0); - return remainder; - } - if (other == 1) { - return remainder; - } - if (other == 0) { + // Performs inplace signed / unsigned division. Returns remainder if not + // dividing by zero. + // For signed numbers it behaves like C++ signed integer division. 
+ // That is by truncating the fractionnal part + // https://stackoverflow.com/a/3602857 + LIBC_INLINE constexpr cpp::optional div(const BigInt ÷r) { + if (LIBC_UNLIKELY(divider.is_zero())) return cpp::nullopt; - } - - BigInt quotient(0); - BigInt subtractor = other; - int cur_bit = static_cast(subtractor.clz() - this->clz()); - subtractor.shift_left(cur_bit); - - for (; cur_bit >= 0 && *this > 0; --cur_bit, subtractor.shift_right(1)) { - if (*this >= subtractor) { - this->sub(subtractor); - quotient = quotient | (BigInt(1) << cur_bit); - } - } - remainder = *this; - *this = quotient; - return remainder; + if (LIBC_UNLIKELY(divider == BigInt::one())) + return BigInt::zero(); + Division result; + if constexpr (SIGNED) + result = divide_signed(*this, divider); + else + result = divide_unsigned(*this, divider); + *this = result.quotient; + return result.remainder; } // Efficiently perform BigInt / (x * 2^e), where x is a half-word-size @@ -496,19 +598,16 @@ struct BigInt { // computation of each step is now properly contained within WordType. // And finally we perform some extra alignment steps for the remaining bits. LIBC_INLINE constexpr cpp::optional - div_uint_half_times_pow_2(internal::half_width_t x, size_t e) { - BigInt remainder(0); - - if (x == 0) { + div_uint_half_times_pow_2(multiword::half_width_t x, size_t e) { + BigInt remainder; + if (x == 0) return cpp::nullopt; - } if (e >= Bits) { remainder = *this; - *this = BigInt(0); + *this = BigInt(); return remainder; } - - BigInt quotient(0); + BigInt quotient; WordType x_word = static_cast(x); constexpr size_t LOG2_WORD_SIZE = cpp::bit_width(WORD_SIZE) - 1; constexpr size_t HALF_WORD_SIZE = WORD_SIZE >> 1; @@ -633,189 +732,22 @@ struct BigInt { return *this; } - // TODO: remove and use cpp::countl_zero below. 
-  [[nodiscard]] LIBC_INLINE constexpr int clz() const {
-    constexpr int word_digits = cpp::numeric_limits<WordType>::digits;
-    int leading_zeroes = 0;
-    for (auto i = val.size(); i > 0;) {
-      --i;
-      const int zeroes = cpp::countl_zero(val[i]);
-      leading_zeroes += zeroes;
-      if (zeroes != word_digits)
-        break;
-    }
-    return leading_zeroes;
-  }
-
-  // TODO: remove and use cpp::countr_zero below.
-  [[nodiscard]] LIBC_INLINE constexpr int ctz() const {
-    constexpr int word_digits = cpp::numeric_limits<WordType>::digits;
-    int trailing_zeroes = 0;
-    for (auto word : val) {
-      const int zeroes = cpp::countr_zero(word);
-      trailing_zeroes += zeroes;
-      if (zeroes != word_digits)
-        break;
-    }
-    return trailing_zeroes;
-  }
-
-  LIBC_INLINE constexpr void shift_left(size_t s) {
-    if constexpr (Bits == WORD_SIZE) {
-      // Use native types if possible.
-      if (s >= WORD_SIZE) {
-        val[0] = 0;
-        return;
-      }
-      val[0] <<= s;
-      return;
-    }
-    if constexpr ((Bits == 64) && (WORD_SIZE == 32)) {
-      // Use builtin 64 bits for 32-bit base type if available;
-      if (s >= 64) {
-        val[0] = 0;
-        val[1] = 0;
-        return;
-      }
-      uint64_t tmp = uint64_t(val[0]) + (uint64_t(val[1]) << 32);
-      tmp <<= s;
-      val[0] = uint32_t(tmp);
-      val[1] = uint32_t(tmp >> 32);
-      return;
-    }
-#ifdef LIBC_TYPES_HAS_INT128
-    if constexpr ((Bits == 128) && (WORD_SIZE == 64)) {
-      // Use builtin 128 bits if available;
-      if (s >= 128) {
-        val[0] = 0;
-        val[1] = 0;
-        return;
-      }
-      __uint128_t tmp = __uint128_t(val[0]) + (__uint128_t(val[1]) << 64);
-      tmp <<= s;
-      val[0] = uint64_t(tmp);
-      val[1] = uint64_t(tmp >> 64);
-      return;
-    }
-#endif // LIBC_TYPES_HAS_INT128
-    if (LIBC_UNLIKELY(s == 0))
-      return;
-
-    const size_t drop = s / WORD_SIZE;  // Number of words to drop
-    const size_t shift = s % WORD_SIZE; // Bits to shift in the remaining words.
- size_t i = WORD_COUNT; - - if (drop < WORD_COUNT) { - i = WORD_COUNT - 1; - if (shift > 0) { - for (size_t j = WORD_COUNT - 1 - drop; j > 0; --i, --j) { - val[i] = (val[j] << shift) | (val[j - 1] >> (WORD_SIZE - shift)); - } - val[i] = val[0] << shift; - } else { - for (size_t j = WORD_COUNT - 1 - drop; j > 0; --i, --j) { - val[i] = val[j]; - } - val[i] = val[0]; - } - } - - for (size_t j = 0; j < i; ++j) { - val[j] = 0; - } + LIBC_INLINE constexpr BigInt &operator<<=(size_t s) { + val = multiword::shift(val, s); + return *this; } LIBC_INLINE constexpr BigInt operator<<(size_t s) const { - BigInt result(*this); - result.shift_left(s); - return result; + return BigInt(multiword::shift(val, s)); } - LIBC_INLINE constexpr BigInt &operator<<=(size_t s) { - shift_left(s); + LIBC_INLINE constexpr BigInt &operator>>=(size_t s) { + val = multiword::shift(val, s); return *this; } - LIBC_INLINE constexpr void shift_right(size_t s) { - if constexpr ((Bits == 64) && (WORD_SIZE == 32)) { - // Use builtin 64 bits if available; - if (s >= 64) { - val[0] = 0; - val[1] = 0; - return; - } - uint64_t tmp = uint64_t(val[0]) + (uint64_t(val[1]) << 32); - if constexpr (Signed) { - tmp = static_cast(static_cast(tmp) >> s); - } else { - tmp >>= s; - } - val[0] = uint32_t(tmp); - val[1] = uint32_t(tmp >> 32); - return; - } -#ifdef LIBC_TYPES_HAS_INT128 - if constexpr ((Bits == 128) && (WORD_SIZE == 64)) { - // Use builtin 128 bits if available; - if (s >= 128) { - val[0] = 0; - val[1] = 0; - return; - } - __uint128_t tmp = __uint128_t(val[0]) + (__uint128_t(val[1]) << 64); - if constexpr (Signed) { - tmp = static_cast<__uint128_t>(static_cast<__int128_t>(tmp) >> s); - } else { - tmp >>= s; - } - val[0] = uint64_t(tmp); - val[1] = uint64_t(tmp >> 64); - return; - } -#endif // LIBC_TYPES_HAS_INT128 - - if (LIBC_UNLIKELY(s == 0)) - return; - const size_t drop = s / WORD_SIZE; // Number of words to drop - const size_t shift = s % WORD_SIZE; // Bit shift in the remaining words. 
- - size_t i = 0; - WordType sign = Signed ? is_neg() : 0; - - if (drop < WORD_COUNT) { - if (shift > 0) { - for (size_t j = drop; j < WORD_COUNT - 1; ++i, ++j) { - val[i] = (val[j] >> shift) | (val[j + 1] << (WORD_SIZE - shift)); - } - if constexpr (Signed) { - val[i] = static_cast( - static_cast>(val[WORD_COUNT - 1]) >> - shift); - } else { - val[i] = val[WORD_COUNT - 1] >> shift; - } - ++i; - } else { - for (size_t j = drop; j < WORD_COUNT; ++i, ++j) { - val[i] = val[j]; - } - } - } - - for (; i < WORD_COUNT; ++i) { - val[i] = sign; - } - } - LIBC_INLINE constexpr BigInt operator>>(size_t s) const { - BigInt result(*this); - result.shift_right(s); - return result; - } - - LIBC_INLINE constexpr BigInt &operator>>=(size_t s) { - shift_right(s); - return *this; + return BigInt(multiword::shift(val, s)); } #define DEFINE_BINOP(OP) \ @@ -833,10 +765,9 @@ struct BigInt { return lhs; \ } - DEFINE_BINOP(&) - DEFINE_BINOP(|) - DEFINE_BINOP(^) - + DEFINE_BINOP(&) // & and &= + DEFINE_BINOP(|) // | and |= + DEFINE_BINOP(^) // ^ and ^= #undef DEFINE_BINOP LIBC_INLINE constexpr BigInt operator~() const { @@ -847,8 +778,8 @@ struct BigInt { } LIBC_INLINE constexpr BigInt operator-() const { - BigInt result = ~(*this); - result.add(BigInt(1)); + BigInt result(*this); + result.negate(); return result; } @@ -865,24 +796,6 @@ struct BigInt { return !(lhs == rhs); } -private: - LIBC_INLINE friend constexpr int cmp(const BigInt &lhs, const BigInt &rhs) { - const auto compare = [](WordType a, WordType b) { - return a == b ? 0 : a > b ? 1 : -1; - }; - if constexpr (Signed) { - const bool lhs_is_neg = lhs.is_neg(); - const bool rhs_is_neg = rhs.is_neg(); - if (lhs_is_neg != rhs_is_neg) - return rhs_is_neg ? 
1 : -1;
-    }
-    for (size_t i = WORD_COUNT; i-- > 0;)
-      if (auto cmp = compare(lhs[i], rhs[i]); cmp != 0)
-        return cmp;
-    return 0;
-  }
-
-public:
   LIBC_INLINE friend constexpr bool operator>(const BigInt &lhs,
                                               const BigInt &rhs) {
     return cmp(lhs, rhs) > 0;
@@ -901,24 +814,24 @@ public:
   }
 
   LIBC_INLINE constexpr BigInt &operator++() {
-    add(BigInt(1));
+    increment();
     return *this;
   }
 
   LIBC_INLINE constexpr BigInt operator++(int) {
     BigInt oldval(*this);
-    add(BigInt(1));
+    increment();
     return oldval;
   }
 
   LIBC_INLINE constexpr BigInt &operator--() {
-    sub(BigInt(1));
+    decrement();
     return *this;
   }
 
   LIBC_INLINE constexpr BigInt operator--(int) {
     BigInt oldval(*this);
-    sub(BigInt(1));
+    decrement();
     return oldval;
   }
 
@@ -930,9 +843,117 @@ public:
   // Return the i-th word of the number.
   LIBC_INLINE constexpr WordType &operator[](size_t i) { return val[i]; }
 
-  LIBC_INLINE WordType *data() { return val; }
+private:
+  LIBC_INLINE friend constexpr int cmp(const BigInt &lhs, const BigInt &rhs) {
+    constexpr auto compare = [](WordType a, WordType b) {
+      return a == b ? 0 : a > b ? 1 : -1;
+    };
+    if constexpr (Signed) {
+      const bool lhs_is_neg = lhs.is_neg();
+      const bool rhs_is_neg = rhs.is_neg();
+      if (lhs_is_neg != rhs_is_neg)
+        return rhs_is_neg ? 1 : -1;
+    }
+    for (size_t i = WORD_COUNT; i-- > 0;)
+      if (auto cmp = compare(lhs[i], rhs[i]); cmp != 0)
+        return cmp;
+    return 0;
+  }
+
+  LIBC_INLINE constexpr void bitwise_not() {
+    for (auto &part : val)
+      part = ~part;
+  }
+
+  LIBC_INLINE constexpr void negate() {
+    bitwise_not();
+    increment();
+  }
 
-  LIBC_INLINE const WordType *data() const { return val; }
+  LIBC_INLINE constexpr void increment() {
+    multiword::add_with_carry(val, cpp::array<WordType, 1>{1});
+  }
+
+  LIBC_INLINE constexpr void decrement() {
+    multiword::sub_with_borrow(val, cpp::array<WordType, 1>{1});
+  }
+
+  LIBC_INLINE constexpr void extend(size_t index, bool is_neg) {
+    const WordType value = is_neg ? 
cpp::numeric_limits::max() + : cpp::numeric_limits::min(); + for (size_t i = index; i < WORD_COUNT; ++i) + val[i] = value; + } + + LIBC_INLINE constexpr bool get_msb() const { + return val.back() >> (WORD_SIZE - 1); + } + + LIBC_INLINE constexpr void set_msb() { + val.back() |= mask_leading_ones(); + } + + LIBC_INLINE constexpr void clear_msb() { + val.back() &= mask_trailing_ones(); + } + + LIBC_INLINE constexpr void set_bit(size_t i) { + const size_t word_index = i / WORD_SIZE; + val[word_index] |= WordType(1) << (i % WORD_SIZE); + } + + LIBC_INLINE constexpr static Division divide_unsigned(const BigInt ÷nd, + const BigInt ÷r) { + BigInt remainder = dividend; + BigInt quotient; + if (remainder >= divider) { + BigInt subtractor = divider; + int cur_bit = multiword::countl_zero(subtractor.val) - + multiword::countl_zero(remainder.val); + subtractor <<= cur_bit; + for (; cur_bit >= 0 && remainder > 0; --cur_bit, subtractor >>= 1) { + if (remainder < subtractor) + continue; + remainder -= subtractor; + quotient.set_bit(cur_bit); + } + } + return Division{quotient, remainder}; + } + + LIBC_INLINE constexpr static Division divide_signed(const BigInt ÷nd, + const BigInt ÷r) { + // Special case because it is not possible to negate the min value of a + // signed integer. + if (dividend == min() && divider == min()) + return Division{one(), zero()}; + // 1. Convert the dividend and divisor to unsigned representation. + unsigned_type udividend(dividend); + unsigned_type udivider(divider); + // 2. Negate the dividend if it's negative, and similarly for the divisor. + const bool dividend_is_neg = dividend.is_neg(); + const bool divider_is_neg = divider.is_neg(); + if (dividend_is_neg) + udividend.negate(); + if (divider_is_neg) + udivider.negate(); + // 3. Use unsigned multiword division algorithm. + const auto unsigned_result = divide_unsigned(udividend, udivider); + // 4. Convert the quotient and remainder to signed representation. 
+ Division result; + result.quotient = signed_type(unsigned_result.quotient); + result.remainder = signed_type(unsigned_result.remainder); + // 5. Negate the quotient if the dividend and divisor had opposite signs. + if (dividend_is_neg != divider_is_neg) + result.quotient.negate(); + // 6. Negate the remainder if the dividend was negative. + if (dividend_is_neg) + result.remainder.negate(); + return result; + } + + friend signed_type; + friend unsigned_type; }; namespace internal { @@ -962,10 +983,8 @@ using Int = BigInt>; // Provides limits of U/Int<128>. template <> class cpp::numeric_limits> { public: - LIBC_INLINE static constexpr UInt<128> max() { - return UInt<128>({0xffff'ffff'ffff'ffff, 0xffff'ffff'ffff'ffff}); - } - LIBC_INLINE static constexpr UInt<128> min() { return UInt<128>(0); } + LIBC_INLINE static constexpr UInt<128> max() { return UInt<128>::max(); } + LIBC_INLINE static constexpr UInt<128> min() { return UInt<128>::min(); } // Meant to match std::numeric_limits interface. // NOLINTNEXTLINE(readability-identifier-naming) LIBC_INLINE_VAR static constexpr int digits = 128; @@ -973,12 +992,8 @@ public: template <> class cpp::numeric_limits> { public: - LIBC_INLINE static constexpr Int<128> max() { - return Int<128>({0xffff'ffff'ffff'ffff, 0x7fff'ffff'ffff'ffff}); - } - LIBC_INLINE static constexpr Int<128> min() { - return Int<128>({0, 0x8000'0000'0000'0000}); - } + LIBC_INLINE static constexpr Int<128> max() { return Int<128>::max(); } + LIBC_INLINE static constexpr Int<128> min() { return Int<128>::min(); } // Meant to match std::numeric_limits interface. // NOLINTNEXTLINE(readability-identifier-naming) LIBC_INLINE_VAR static constexpr int digits = 128; @@ -1112,30 +1127,28 @@ has_single_bit(T value) { template [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> countr_zero(const T &value) { - return value.ctz(); + return multiword::countr_zero(value.val); } // Specialization of cpp::countl_zero ('bit.h') for BigInt. 
template [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> countl_zero(const T &value) { - return value.clz(); + return multiword::countl_zero(value.val); } // Specialization of cpp::countl_one ('bit.h') for BigInt. template [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> countl_one(T value) { - // TODO : Implement a faster version not involving operator~. - return cpp::countl_zero(~value); + return multiword::countl_one(value.val); } // Specialization of cpp::countr_one ('bit.h') for BigInt. template [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> countr_one(T value) { - // TODO : Implement a faster version not involving operator~. - return cpp::countr_zero(~value); + return multiword::countr_one(value.val); } // Specialization of cpp::bit_width ('bit.h') for BigInt. @@ -1182,65 +1195,59 @@ rotr(T value, int rotate) { template LIBC_INLINE constexpr cpp::enable_if_t, T> mask_trailing_ones() { - static_assert(!T::SIGNED); - if (count == 0) - return T(); - constexpr unsigned T_BITS = CHAR_BIT * sizeof(T); - static_assert(count <= T_BITS && "Invalid bit index"); - using word_type = typename T::word_type; - T out; - constexpr int CHUNK_INDEX_CONTAINING_BIT = - static_cast(count / T::WORD_SIZE); - int index = 0; - for (auto &word : out.val) { - if (index < CHUNK_INDEX_CONTAINING_BIT) - word = -1; - else if (index > CHUNK_INDEX_CONTAINING_BIT) - word = 0; - else - word = mask_trailing_ones(); - ++index; - } + static_assert(!T::SIGNED && count <= T::BITS); + if (count == T::BITS) + return T::all_ones(); + constexpr size_t QUOTIENT = count / T::WORD_SIZE; + constexpr size_t REMAINDER = count % T::WORD_SIZE; + T out; // zero initialized + for (size_t i = 0; i <= QUOTIENT; ++i) + out[i] = i < QUOTIENT + ? -1 + : mask_trailing_ones(); return out; } // Specialization of mask_leading_ones ('math_extras.h') for BigInt. 
template LIBC_INLINE constexpr cpp::enable_if_t, T> mask_leading_ones() { - static_assert(!T::SIGNED); - if (count == 0) - return T(); - constexpr unsigned T_BITS = CHAR_BIT * sizeof(T); - static_assert(count <= T_BITS && "Invalid bit index"); - using word_type = typename T::word_type; - T out; - constexpr int CHUNK_INDEX_CONTAINING_BIT = - static_cast((T::BITS - count - 1ULL) / T::WORD_SIZE); - int index = 0; - for (auto &word : out.val) { - if (index < CHUNK_INDEX_CONTAINING_BIT) - word = 0; - else if (index > CHUNK_INDEX_CONTAINING_BIT) - word = -1; - else - word = mask_leading_ones(); - ++index; - } + static_assert(!T::SIGNED && count <= T::BITS); + if (count == T::BITS) + return T::all_ones(); + constexpr size_t QUOTIENT = (T::BITS - count - 1U) / T::WORD_SIZE; + constexpr size_t REMAINDER = count % T::WORD_SIZE; + T out; // zero initialized + for (size_t i = QUOTIENT; i < T::WORD_COUNT; ++i) + out[i] = i > QUOTIENT + ? -1 + : mask_leading_ones(); return out; } +// Specialization of mask_trailing_zeros ('math_extras.h') for BigInt. +template +LIBC_INLINE constexpr cpp::enable_if_t, T> +mask_trailing_zeros() { + return mask_leading_ones(); +} + +// Specialization of mask_leading_zeros ('math_extras.h') for BigInt. +template +LIBC_INLINE constexpr cpp::enable_if_t, T> +mask_leading_zeros() { + return mask_trailing_ones(); +} + // Specialization of count_zeros ('math_extras.h') for BigInt. template -[[nodiscard]] -LIBC_INLINE constexpr cpp::enable_if_t, int> +[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> count_zeros(T value) { return cpp::popcount(~value); } // Specialization of first_leading_zero ('math_extras.h') for BigInt. template -[[nodiscard]] -LIBC_INLINE constexpr cpp::enable_if_t, int> +[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> first_leading_zero(T value) { return value == cpp::numeric_limits::max() ? 
0 : cpp::countl_one(value) + 1; @@ -1248,16 +1255,14 @@ first_leading_zero(T value) { // Specialization of first_leading_one ('math_extras.h') for BigInt. template -[[nodiscard]] -LIBC_INLINE constexpr cpp::enable_if_t, int> +[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> first_leading_one(T value) { return first_leading_zero(~value); } // Specialization of first_trailing_zero ('math_extras.h') for BigInt. template -[[nodiscard]] -LIBC_INLINE constexpr cpp::enable_if_t, int> +[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> first_trailing_zero(T value) { return value == cpp::numeric_limits::max() ? 0 : cpp::countr_zero(~value) + 1; @@ -1265,8 +1270,7 @@ first_trailing_zero(T value) { // Specialization of first_trailing_one ('math_extras.h') for BigInt. template -[[nodiscard]] -LIBC_INLINE constexpr cpp::enable_if_t, int> +[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> first_trailing_one(T value) { return value == cpp::numeric_limits::max() ? 0 : cpp::countr_zero(value) + 1; diff --git a/libc/src/__support/float_to_string.h b/libc/src/__support/float_to_string.h index 1287c3e9a84f..4c59cfd99c2e 100644 --- a/libc/src/__support/float_to_string.h +++ b/libc/src/__support/float_to_string.h @@ -689,7 +689,7 @@ template <> class FloatToString { wide_int float_as_int = mantissa; - float_as_int.shift_left(exponent); + float_as_int <<= exponent; int_block_index = 0; while (float_as_int > 0) { @@ -708,10 +708,11 @@ template <> class FloatToString { const int SHIFT_AMOUNT = FLOAT_AS_INT_WIDTH + exponent; static_assert(EXTRA_INT_WIDTH >= sizeof(long double) * 8); - float_as_fixed.shift_left(SHIFT_AMOUNT); + float_as_fixed <<= SHIFT_AMOUNT; // If there are still digits above the decimal point, handle those. 
- if (float_as_fixed.clz() < static_cast(EXTRA_INT_WIDTH)) { + if (cpp::countl_zero(float_as_fixed) < + static_cast(EXTRA_INT_WIDTH)) { UInt above_decimal_point = float_as_fixed >> FLOAT_AS_INT_WIDTH; diff --git a/libc/src/__support/integer_literals.h b/libc/src/__support/integer_literals.h index de1f88fbd3f3..e99799c3512e 100644 --- a/libc/src/__support/integer_literals.h +++ b/libc/src/__support/integer_literals.h @@ -151,12 +151,15 @@ template struct Parser> { template LIBC_INLINE constexpr T parse_with_prefix(const char *ptr) { using P = Parser; - if (ptr[0] == '0' && ptr[1] == 'x') - return P::template parse<16>(ptr + 2); - else if (ptr[0] == '0' && ptr[1] == 'b') - return P::template parse<2>(ptr + 2); - else - return P::template parse<10>(ptr); + if (ptr == nullptr) + return T(); + if (ptr[0] == '0') { + if (ptr[1] == 'b') + return P::template parse<2>(ptr + 2); + if (ptr[1] == 'x') + return P::template parse<16>(ptr + 2); + } + return P::template parse<10>(ptr); } } // namespace internal @@ -169,6 +172,16 @@ LIBC_INLINE constexpr auto operator""_u256(const char *x) { return internal::parse_with_prefix>(x); } +template LIBC_INLINE constexpr T parse_bigint(const char *ptr) { + if (ptr == nullptr) + return T(); + if (ptr[0] == '-' || ptr[0] == '+') { + auto positive = internal::parse_with_prefix(ptr + 1); + return ptr[0] == '-' ? 
-positive : positive; + } + return internal::parse_with_prefix(ptr); +} + } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC___SUPPORT_INTEGER_LITERALS_H diff --git a/libc/src/__support/math_extras.h b/libc/src/__support/math_extras.h index 70a8800b285d..bb6424bfa8e4 100644 --- a/libc/src/__support/math_extras.h +++ b/libc/src/__support/math_extras.h @@ -10,9 +10,9 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXTRAS_H #define LLVM_LIBC_SRC___SUPPORT_MATH_EXTRAS_H -#include "src/__support/CPP/bit.h" // countl_one, countr_zero -#include "src/__support/CPP/limits.h" // CHAR_BIT, numeric_limits -#include "src/__support/CPP/type_traits.h" // is_unsigned_v +#include "src/__support/CPP/bit.h" // countl_one, countr_zero +#include "src/__support/CPP/limits.h" // CHAR_BIT, numeric_limits +#include "src/__support/CPP/type_traits.h" // is_unsigned_v, is_constant_evaluated #include "src/__support/macros/attributes.h" // LIBC_INLINE namespace LIBC_NAMESPACE { @@ -32,199 +32,94 @@ mask_trailing_ones() { template LIBC_INLINE constexpr cpp::enable_if_t, T> mask_leading_ones() { - constexpr T MASK(mask_trailing_ones()); - return T(~MASK); // bitwise NOT performs integer promotion. + return T(~mask_trailing_ones()); } -// Add with carry -template struct SumCarry { - T sum; - T carry; -}; - -// This version is always valid for constexpr. 
-template -LIBC_INLINE constexpr cpp::enable_if_t< - cpp::is_integral_v && cpp::is_unsigned_v, SumCarry> -add_with_carry_const(T a, T b, T carry_in) { - T tmp = a + carry_in; - T sum = b + tmp; - T carry_out = (sum < b) + (tmp < a); - return {sum, carry_out}; -} - -template -LIBC_INLINE constexpr cpp::enable_if_t< - cpp::is_integral_v && cpp::is_unsigned_v, SumCarry> -add_with_carry(T a, T b, T carry_in) { - return add_with_carry_const(a, b, carry_in); -} - -#if __has_builtin(__builtin_addc) -// https://clang.llvm.org/docs/LanguageExtensions.html#multiprecision-arithmetic-builtins - -template <> -LIBC_INLINE constexpr SumCarry -add_with_carry(unsigned char a, unsigned char b, - unsigned char carry_in) { - if (__builtin_is_constant_evaluated()) { - return add_with_carry_const(a, b, carry_in); - } else { - SumCarry result{0, 0}; - result.sum = __builtin_addcb(a, b, carry_in, &result.carry); - return result; - } -} - -template <> -LIBC_INLINE constexpr SumCarry -add_with_carry(unsigned short a, unsigned short b, - unsigned short carry_in) { - if (__builtin_is_constant_evaluated()) { - return add_with_carry_const(a, b, carry_in); - } else { - SumCarry result{0, 0}; - result.sum = __builtin_addcs(a, b, carry_in, &result.carry); - return result; - } -} - -template <> -LIBC_INLINE constexpr SumCarry -add_with_carry(unsigned int a, unsigned int b, - unsigned int carry_in) { - if (__builtin_is_constant_evaluated()) { - return add_with_carry_const(a, b, carry_in); - } else { - SumCarry result{0, 0}; - result.sum = __builtin_addc(a, b, carry_in, &result.carry); - return result; - } -} - -template <> -LIBC_INLINE constexpr SumCarry -add_with_carry(unsigned long a, unsigned long b, - unsigned long carry_in) { - if (__builtin_is_constant_evaluated()) { - return add_with_carry_const(a, b, carry_in); - } else { - SumCarry result{0, 0}; - result.sum = __builtin_addcl(a, b, carry_in, &result.carry); - return result; - } +// Create a bitmask with the count right-most bits set to 0, 
and all other bits +// set to 1. Only unsigned types are allowed. +template +LIBC_INLINE constexpr cpp::enable_if_t, T> +mask_trailing_zeros() { + return mask_leading_ones(); } -template <> -LIBC_INLINE constexpr SumCarry -add_with_carry(unsigned long long a, unsigned long long b, - unsigned long long carry_in) { - if (__builtin_is_constant_evaluated()) { - return add_with_carry_const(a, b, carry_in); - } else { - SumCarry result{0, 0}; - result.sum = __builtin_addcll(a, b, carry_in, &result.carry); - return result; - } +// Create a bitmask with the count left-most bits set to 0, and all other bits +// set to 1. Only unsigned types are allowed. +template +LIBC_INLINE constexpr cpp::enable_if_t, T> +mask_leading_zeros() { + return mask_trailing_ones(); } -#endif // __has_builtin(__builtin_addc) - -// Subtract with borrow -template struct DiffBorrow { - T diff; - T borrow; -}; - -// This version is always valid for constexpr. +// Returns whether 'a + b' overflows, the result is stored in 'res'. template -LIBC_INLINE constexpr cpp::enable_if_t< - cpp::is_integral_v && cpp::is_unsigned_v, DiffBorrow> -sub_with_borrow_const(T a, T b, T borrow_in) { - T tmp = a - b; - T diff = tmp - borrow_in; - T borrow_out = (diff > tmp) + (tmp > a); - return {diff, borrow_out}; +[[nodiscard]] LIBC_INLINE constexpr bool add_overflow(T a, T b, T &res) { + return __builtin_add_overflow(a, b, &res); } -// This version is not always valid for constepxr because it's overriden below -// if builtins are available. +// Returns whether 'a - b' overflows, the result is stored in 'res'. 
template -LIBC_INLINE constexpr cpp::enable_if_t< - cpp::is_integral_v && cpp::is_unsigned_v, DiffBorrow> -sub_with_borrow(T a, T b, T borrow_in) { - return sub_with_borrow_const(a, b, borrow_in); +[[nodiscard]] LIBC_INLINE constexpr bool sub_overflow(T a, T b, T &res) { + return __builtin_sub_overflow(a, b, &res); } -#if __has_builtin(__builtin_subc) -// https://clang.llvm.org/docs/LanguageExtensions.html#multiprecision-arithmetic-builtins - -template <> -LIBC_INLINE constexpr DiffBorrow -sub_with_borrow(unsigned char a, unsigned char b, - unsigned char borrow_in) { - if (__builtin_is_constant_evaluated()) { - return sub_with_borrow_const(a, b, borrow_in); - } else { - DiffBorrow result{0, 0}; - result.diff = __builtin_subcb(a, b, borrow_in, &result.borrow); - return result; - } -} +#define RETURN_IF(TYPE, BUILTIN) \ + if constexpr (cpp::is_same_v) \ + return BUILTIN(a, b, carry_in, carry_out); -template <> -LIBC_INLINE constexpr DiffBorrow -sub_with_borrow(unsigned short a, unsigned short b, - unsigned short borrow_in) { - if (__builtin_is_constant_evaluated()) { - return sub_with_borrow_const(a, b, borrow_in); - } else { - DiffBorrow result{0, 0}; - result.diff = __builtin_subcs(a, b, borrow_in, &result.borrow); - return result; - } -} - -template <> -LIBC_INLINE constexpr DiffBorrow -sub_with_borrow(unsigned int a, unsigned int b, - unsigned int borrow_in) { - if (__builtin_is_constant_evaluated()) { - return sub_with_borrow_const(a, b, borrow_in); - } else { - DiffBorrow result{0, 0}; - result.diff = __builtin_subc(a, b, borrow_in, &result.borrow); - return result; - } -} - -template <> -LIBC_INLINE constexpr DiffBorrow -sub_with_borrow(unsigned long a, unsigned long b, - unsigned long borrow_in) { - if (__builtin_is_constant_evaluated()) { - return sub_with_borrow_const(a, b, borrow_in); - } else { - DiffBorrow result{0, 0}; - result.diff = __builtin_subcl(a, b, borrow_in, &result.borrow); - return result; +// Returns the result of 'a + b' taking into account 
'carry_in'. +// The carry out is stored in 'carry_out' it not 'nullptr', dropped otherwise. +// We keep the pass by pointer interface for consistency with the intrinsic. +template +[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, T> +add_with_carry(T a, T b, T carry_in, T &carry_out) { + if constexpr (!cpp::is_constant_evaluated()) { +#if __has_builtin(__builtin_addcb) + RETURN_IF(unsigned char, __builtin_addcb) +#elif __has_builtin(__builtin_addcs) + RETURN_IF(unsigned short, __builtin_addcs) +#elif __has_builtin(__builtin_addc) + RETURN_IF(unsigned int, __builtin_addc) +#elif __has_builtin(__builtin_addcl) + RETURN_IF(unsigned long, __builtin_addcl) +#elif __has_builtin(__builtin_addcll) + RETURN_IF(unsigned long long, __builtin_addcll) +#endif } + T sum; + T carry1 = add_overflow(a, b, sum); + T carry2 = add_overflow(sum, carry_in, sum); + carry_out = carry1 | carry2; + return sum; } -template <> -LIBC_INLINE constexpr DiffBorrow -sub_with_borrow(unsigned long long a, unsigned long long b, - unsigned long long borrow_in) { - if (__builtin_is_constant_evaluated()) { - return sub_with_borrow_const(a, b, borrow_in); - } else { - DiffBorrow result{0, 0}; - result.diff = __builtin_subcll(a, b, borrow_in, &result.borrow); - return result; +// Returns the result of 'a - b' taking into account 'carry_in'. +// The carry out is stored in 'carry_out' it not 'nullptr', dropped otherwise. +// We keep the pass by pointer interface for consistency with the intrinsic. 
+template +[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, T> +sub_with_borrow(T a, T b, T carry_in, T &carry_out) { + if constexpr (!cpp::is_constant_evaluated()) { +#if __has_builtin(__builtin_subcb) + RETURN_IF(unsigned char, __builtin_subcb) +#elif __has_builtin(__builtin_subcs) + RETURN_IF(unsigned short, __builtin_subcs) +#elif __has_builtin(__builtin_subc) + RETURN_IF(unsigned int, __builtin_subc) +#elif __has_builtin(__builtin_subcl) + RETURN_IF(unsigned long, __builtin_subcl) +#elif __has_builtin(__builtin_subcll) + RETURN_IF(unsigned long long, __builtin_subcll) +#endif } + T sub; + T carry1 = sub_overflow(a, b, sub); + T carry2 = sub_overflow(sub, carry_in, sub); + carry_out = carry1 | carry2; + return sub; } -#endif // __has_builtin(__builtin_subc) +#undef RETURN_IF template [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> diff --git a/libc/src/__support/number_pair.h b/libc/src/__support/number_pair.h index ee6667b1299f..2f713fc03520 100644 --- a/libc/src/__support/number_pair.h +++ b/libc/src/__support/number_pair.h @@ -20,17 +20,6 @@ template struct NumberPair { T hi = T(0); }; -template -cpp::enable_if_t && cpp::is_unsigned_v, - NumberPair> constexpr split(T a) { - constexpr size_t HALF_BIT_WIDTH = sizeof(T) * 4; - constexpr T LOWER_HALF_MASK = (T(1) << HALF_BIT_WIDTH) - T(1); - NumberPair result; - result.lo = a & LOWER_HALF_MASK; - result.hi = a >> HALF_BIT_WIDTH; - return result; -} - } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC___SUPPORT_NUMBER_PAIR_H diff --git a/libc/test/src/__support/integer_literals_test.cpp b/libc/test/src/__support/integer_literals_test.cpp index 5298cf30156e..cbc906aa7c99 100644 --- a/libc/test/src/__support/integer_literals_test.cpp +++ b/libc/test/src/__support/integer_literals_test.cpp @@ -133,3 +133,24 @@ TEST(LlvmLibcIntegerLiteralTest, u256) { U256_MAX, 0xFFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF_u256); } + +TEST(LlvmLibcIntegerLiteralTest, parse_bigint) { + 
using T = LIBC_NAMESPACE::Int<128>; + struct { + const char *str; + T expected; + } constexpr TEST_CASES[] = { + {"0", 0}, {"-1", -1}, {"+1", 1}, {"-0xFF", -255}, {"-0b11", -3}, + }; + for (auto tc : TEST_CASES) { + T actual = LIBC_NAMESPACE::parse_bigint(tc.str); + EXPECT_EQ(actual, tc.expected); + } +} + +TEST(LlvmLibcIntegerLiteralTest, parse_bigint_invalid) { + using T = LIBC_NAMESPACE::Int<128>; + const T expected; // default construction + EXPECT_EQ(LIBC_NAMESPACE::parse_bigint(nullptr), expected); + EXPECT_EQ(LIBC_NAMESPACE::parse_bigint(""), expected); +} diff --git a/libc/test/src/__support/math_extras_test.cpp b/libc/test/src/__support/math_extras_test.cpp index e88b3e1d6b68..401e631ea4ba 100644 --- a/libc/test/src/__support/math_extras_test.cpp +++ b/libc/test/src/__support/math_extras_test.cpp @@ -101,4 +101,61 @@ TYPED_TEST(LlvmLibcBitTest, CountZeros, UnsignedTypesNoBigInt) { EXPECT_EQ(count_zeros(cpp::numeric_limits::max() >> i), i); } +using UnsignedTypes = testing::TypeList< +#if defined(__SIZEOF_INT128__) + __uint128_t, +#endif + unsigned char, unsigned short, unsigned int, unsigned long, + unsigned long long>; + +TYPED_TEST(LlvmLibcBlockMathExtrasTest, add_overflow, UnsignedTypes) { + constexpr T ZERO = cpp::numeric_limits::min(); + constexpr T ONE(1); + constexpr T MAX = cpp::numeric_limits::max(); + constexpr T BEFORE_MAX = MAX - 1; + + const struct { + T lhs; + T rhs; + T sum; + bool carry; + } TESTS[] = { + {ZERO, ONE, ONE, false}, // 0x00 + 0x01 = 0x01 + {BEFORE_MAX, ONE, MAX, false}, // 0xFE + 0x01 = 0xFF + {MAX, ONE, ZERO, true}, // 0xFF + 0x01 = 0x00 (carry) + {MAX, MAX, BEFORE_MAX, true}, // 0xFF + 0xFF = 0xFE (carry) + }; + for (auto tc : TESTS) { + T sum; + bool carry = add_overflow(tc.lhs, tc.rhs, sum); + EXPECT_EQ(sum, tc.sum); + EXPECT_EQ(carry, tc.carry); + } +} + +TYPED_TEST(LlvmLibcBlockMathExtrasTest, sub_overflow, UnsignedTypes) { + constexpr T ZERO = cpp::numeric_limits::min(); + constexpr T ONE(1); + constexpr T MAX = 
cpp::numeric_limits::max(); + constexpr T BEFORE_MAX = MAX - 1; + + const struct { + T lhs; + T rhs; + T sub; + bool carry; + } TESTS[] = { + {ONE, ZERO, ONE, false}, // 0x01 - 0x00 = 0x01 + {MAX, MAX, ZERO, false}, // 0xFF - 0xFF = 0x00 + {ZERO, ONE, MAX, true}, // 0x00 - 0x01 = 0xFF (carry) + {BEFORE_MAX, MAX, MAX, true}, // 0xFE - 0xFF = 0xFF (carry) + }; + for (auto tc : TESTS) { + T sub; + bool carry = sub_overflow(tc.lhs, tc.rhs, sub); + EXPECT_EQ(sub, tc.sub); + EXPECT_EQ(carry, tc.carry); + } +} + } // namespace LIBC_NAMESPACE diff --git a/libc/test/src/__support/uint_test.cpp b/libc/test/src/__support/uint_test.cpp index 5764324ca288..5696e54c73f3 100644 --- a/libc/test/src/__support/uint_test.cpp +++ b/libc/test/src/__support/uint_test.cpp @@ -8,6 +8,7 @@ #include "src/__support/CPP/optional.h" #include "src/__support/UInt.h" +#include "src/__support/integer_literals.h" // parse_unsigned_bigint #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128 #include "include/llvm-libc-macros/math-macros.h" // HUGE_VALF, HUGE_VALF @@ -15,6 +16,195 @@ namespace LIBC_NAMESPACE { +enum Value { ZERO, ONE, TWO, MIN, MAX }; + +template auto create(Value value) { + switch (value) { + case ZERO: + return T(0); + case ONE: + return T(1); + case TWO: + return T(2); + case MIN: + return T::min(); + case MAX: + return T::max(); + } +} + +using Types = testing::TypeList< // +#ifdef LIBC_TYPES_HAS_INT64 + BigInt<64, false, uint64_t>, // 64-bits unsigned (1 x uint64_t) + BigInt<64, true, uint64_t>, // 64-bits signed (1 x uint64_t) +#endif +#ifdef LIBC_TYPES_HAS_INT128 + BigInt<128, false, __uint128_t>, // 128-bits unsigned (1 x __uint128_t) + BigInt<128, true, __uint128_t>, // 128-bits signed (1 x __uint128_t) +#endif + BigInt<16, false, uint16_t>, // 16-bits unsigned (1 x uint16_t) + BigInt<16, true, uint16_t>, // 16-bits signed (1 x uint16_t) + BigInt<64, false, uint16_t>, // 64-bits unsigned (4 x uint16_t) + BigInt<64, true, uint16_t> // 64-bits signed (4 
x uint16_t) + >; + +#define ASSERT_SAME(A, B) ASSERT_TRUE((A) == (B)) + +TYPED_TEST(LlvmLibcUIntClassTest, Additions, Types) { + ASSERT_SAME(create(ZERO) + create(ZERO), create(ZERO)); + ASSERT_SAME(create(ONE) + create(ZERO), create(ONE)); + ASSERT_SAME(create(ZERO) + create(ONE), create(ONE)); + ASSERT_SAME(create(ONE) + create(ONE), create(TWO)); + // 2's complement addition works for signed and unsigned types. + // - unsigned : 0xff + 0x01 = 0x00 (255 + 1 = 0) + // - signed : 0xef + 0x01 = 0xf0 (127 + 1 = -128) + ASSERT_SAME(create(MAX) + create(ONE), create(MIN)); +} + +TYPED_TEST(LlvmLibcUIntClassTest, Subtraction, Types) { + ASSERT_SAME(create(ZERO) - create(ZERO), create(ZERO)); + ASSERT_SAME(create(ONE) - create(ONE), create(ZERO)); + ASSERT_SAME(create(ONE) - create(ZERO), create(ONE)); + // 2's complement subtraction works for signed and unsigned types. + // - unsigned : 0x00 - 0x01 = 0xff ( 0 - 1 = 255) + // - signed : 0xf0 - 0x01 = 0xef (-128 - 1 = 127) + ASSERT_SAME(create(MIN) - create(ONE), create(MAX)); +} + +TYPED_TEST(LlvmLibcUIntClassTest, Multiplication, Types) { + ASSERT_SAME(create(ZERO) * create(ZERO), create(ZERO)); + ASSERT_SAME(create(ZERO) * create(ONE), create(ZERO)); + ASSERT_SAME(create(ONE) * create(ZERO), create(ZERO)); + ASSERT_SAME(create(ONE) * create(ONE), create(ONE)); + ASSERT_SAME(create(ONE) * create(TWO), create(TWO)); + ASSERT_SAME(create(TWO) * create(ONE), create(TWO)); + // - unsigned : 0xff x 0xff = 0x01 (mod 0xff) + // - signed : 0xef x 0xef = 0x01 (mod 0xff) + ASSERT_SAME(create(MAX) * create(MAX), create(ONE)); +} + +template void print(const char *msg, T value) { + testing::tlog << msg; + IntegerToString buffer(value); + testing::tlog << buffer.view() << "\n"; +} + +TEST(LlvmLibcUIntClassTest, SignedAddSub) { + // Computations performed by https://www.wolframalpha.com/ + using T = BigInt<128, true, uint32_t>; + const T a = parse_bigint("1927508279017230597"); + const T b = parse_bigint("278789278723478925"); + 
const T s = parse_bigint("2206297557740709522"); + // Addition + ASSERT_SAME(a + b, s); + ASSERT_SAME(b + a, s); // commutative + // Subtraction + ASSERT_SAME(a - s, -b); + ASSERT_SAME(s - a, b); +} + +TEST(LlvmLibcUIntClassTest, SignedMulDiv) { + // Computations performed by https://www.wolframalpha.com/ + using T = BigInt<128, true, uint16_t>; + struct { + const char *a; + const char *b; + const char *mul; + } const test_cases[] = {{"-4", "3", "-12"}, + {"-3", "-3", "9"}, + {"1927508279017230597", "278789278723478925", + "537368642840747885329125014794668225"}}; + for (auto tc : test_cases) { + const T a = parse_bigint(tc.a); + const T b = parse_bigint(tc.b); + const T mul = parse_bigint(tc.mul); + // Multiplication + ASSERT_SAME(a * b, mul); + ASSERT_SAME(b * a, mul); // commutative + ASSERT_SAME(a * -b, -mul); // sign + ASSERT_SAME(-a * b, -mul); // sign + ASSERT_SAME(-a * -b, mul); // sign + // Division + ASSERT_SAME(mul / a, b); + ASSERT_SAME(mul / b, a); + ASSERT_SAME(-mul / a, -b); // sign + ASSERT_SAME(mul / -a, -b); // sign + ASSERT_SAME(-mul / -a, b); // sign + } +} + +TYPED_TEST(LlvmLibcUIntClassTest, Division, Types) { + ASSERT_SAME(create(ZERO) / create(ONE), create(ZERO)); + ASSERT_SAME(create(MAX) / create(ONE), create(MAX)); + ASSERT_SAME(create(MAX) / create(MAX), create(ONE)); + ASSERT_SAME(create(ONE) / create(ONE), create(ONE)); + if constexpr (T::SIGNED) { + // Special case found by fuzzing. 
+ ASSERT_SAME(create(MIN) / create(MIN), create(ONE)); + } + // - unsigned : 0xff / 0x02 = 0x7f + // - signed : 0x7f / 0x02 = 0x3f + ASSERT_SAME(create(MAX) / create(TWO), (create(MAX) >> 1)); + + using word_type = typename T::word_type; + const T zero_one_repeated = T::all_ones() / T(0xff); + const word_type pattern = word_type(~0) / word_type(0xff); + for (const word_type part : zero_one_repeated.val) { + if constexpr (T::SIGNED == false) { + EXPECT_EQ(part, pattern); + } + } +} + +TYPED_TEST(LlvmLibcUIntClassTest, is_neg, Types) { + EXPECT_FALSE(create(ZERO).is_neg()); + EXPECT_FALSE(create(ONE).is_neg()); + EXPECT_FALSE(create(TWO).is_neg()); + EXPECT_EQ(create(MIN).is_neg(), T::SIGNED); + EXPECT_FALSE(create(MAX).is_neg()); +} + +TYPED_TEST(LlvmLibcUIntClassTest, Masks, Types) { + if constexpr (!T::SIGNED) { + constexpr size_t BITS = T::BITS; + // mask_trailing_ones + ASSERT_SAME((mask_trailing_ones()), T::zero()); + ASSERT_SAME((mask_trailing_ones()), T::one()); + ASSERT_SAME((mask_trailing_ones()), T::all_ones() >> 1); + ASSERT_SAME((mask_trailing_ones()), T::all_ones()); + // mask_leading_ones + ASSERT_SAME((mask_leading_ones()), T::zero()); + ASSERT_SAME((mask_leading_ones()), T::one() << (BITS - 1)); + ASSERT_SAME((mask_leading_ones()), T::all_ones() - T::one()); + ASSERT_SAME((mask_leading_ones()), T::all_ones()); + // mask_trailing_zeros + ASSERT_SAME((mask_trailing_zeros()), T::all_ones()); + ASSERT_SAME((mask_trailing_zeros()), T::all_ones() - T::one()); + ASSERT_SAME((mask_trailing_zeros()), T::one() << (BITS - 1)); + ASSERT_SAME((mask_trailing_zeros()), T::zero()); + // mask_leading_zeros + ASSERT_SAME((mask_leading_zeros()), T::all_ones()); + ASSERT_SAME((mask_leading_zeros()), T::all_ones() >> 1); + ASSERT_SAME((mask_leading_zeros()), T::one()); + ASSERT_SAME((mask_leading_zeros()), T::zero()); + } +} + +TYPED_TEST(LlvmLibcUIntClassTest, CountBits, Types) { + if constexpr (!T::SIGNED) { + for (size_t i = 0; i <= T::BITS; ++i) { + const auto l_one
= T::all_ones() << i; // 0b111...000 + const auto r_one = T::all_ones() >> i; // 0b000...111 + const int zeros = i; + const int ones = T::BITS - zeros; + ASSERT_EQ(cpp::countr_one(r_one), ones); + ASSERT_EQ(cpp::countl_one(l_one), ones); + ASSERT_EQ(cpp::countr_zero(l_one), zeros); + ASSERT_EQ(cpp::countl_zero(r_one), zeros); + } + } +} + using LL_UInt64 = UInt<64>; // We want to test UInt<128> explicitly. So, for // convenience, we use a sugar which does not conflict with the UInt128 type @@ -561,7 +751,7 @@ TEST(LlvmLibcUIntClassTest, FullMulTests) { LL_UInt##Bits a = ~LL_UInt##Bits(0); \ LL_UInt##Bits hi = a.quick_mul_hi(a); \ LL_UInt##Bits trunc = static_cast(a.ful_mul(a) >> Bits); \ - uint64_t overflow = trunc.sub(hi); \ + uint64_t overflow = trunc.sub_overflow(hi); \ EXPECT_EQ(overflow, uint64_t(0)); \ EXPECT_LE(uint64_t(trunc), uint64_t(Error)); \ } while (0) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel index 4f976122967c..c0d402a89ea3 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel @@ -87,6 +87,7 @@ libc_test( srcs = ["uint_test.cpp"], deps = [ "//libc:__support_cpp_optional", + "//libc:__support_integer_literals", "//libc:__support_macros_properties_types", "//libc:__support_uint", "//libc:llvm_libc_macros_math_macros", -- cgit v1.2.3 From 47e996d89d4d1e229451594d4b0752b71e8e231c Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy Date: Thu, 4 Apr 2024 10:50:35 +0200 Subject: [SPIR-V] Fix OpVariable instructions place in a function (#87554) This PR: * fixes OpVariable instructions place in a function (see https://github.com/llvm/llvm-project/issues/66261), * improves type inference, * helps avoiding unneeded bitcasts when validating function call's This allows to improve existing and add new test cases with more strict checks. 
OpVariable fix refers to "All OpVariable instructions in a function must be the first instructions in the first block" requirement from SPIR-V spec. --- llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 8 ++- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h | 6 +-- llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp | 2 +- llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 19 ++++++- llvm/test/CodeGen/SPIRV/OpVariable_order.ll | 14 +++-- .../SPIRV/pointers/type-deduce-by-call-chain.ll | 26 +++++++++- .../SPIRV/pointers/type-deduce-call-no-bitcast.ll | 60 ++++++++++++++++++++++ 7 files changed, 122 insertions(+), 13 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 1674cef7cb82..9e4ba2191366 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -243,8 +243,12 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx, continue; MetadataAsValue *VMD = cast(II->getOperand(1)); - SPIRVType *ElementType = GR->getOrCreateSPIRVType( - cast(VMD->getMetadata())->getType(), MIRBuilder); + Type *ElementTy = cast(VMD->getMetadata())->getType(); + if (isUntypedPointerTy(ElementTy)) + ElementTy = + TypedPointerType::get(IntegerType::getInt8Ty(II->getContext()), + getPointerAddressSpace(ElementTy)); + SPIRVType *ElementType = GR->getOrCreateSPIRVType(ElementTy, MIRBuilder); return GR->getOrCreateSPIRVPointerType( ElementType, MIRBuilder, addressSpaceToStorageClass( diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index e0099e529447..ac799374adce 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -47,7 +47,7 @@ class SPIRVGlobalRegistry { DenseMap InstrToFunction; // Maps Functions to their calls (in a form of the machine instruction, // OpFunctionCall) that 
happened before the definition is available - DenseMap> ForwardCalls; + DenseMap> ForwardCalls; // Look for an equivalent of the newType in the map. Return the equivalent // if it's found, otherwise insert newType to the map and return the type. @@ -215,12 +215,12 @@ public: if (It == ForwardCalls.end()) ForwardCalls[F] = {MI}; else - It->second.push_back(MI); + It->second.insert(MI); } // Map a Function to the vector of machine instructions that represents // forward function calls or to nullptr if not found. - SmallVector *getForwardCalls(const Function *F) { + SmallPtrSet *getForwardCalls(const Function *F) { auto It = ForwardCalls.find(F); return It == ForwardCalls.end() ? nullptr : &It->second; } diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp index 90a31551f45a..d450078d793f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp @@ -193,7 +193,7 @@ void validateForwardCalls(const SPIRVSubtarget &STI, MachineRegisterInfo *DefMRI, SPIRVGlobalRegistry &GR, MachineInstr &FunDef) { const Function *F = GR.getFunctionByDefinition(&FunDef); - if (SmallVector *FwdCalls = GR.getForwardCalls(F)) + if (SmallPtrSet *FwdCalls = GR.getForwardCalls(F)) for (MachineInstr *FunCall : *FwdCalls) { MachineRegisterInfo *CallMRI = &FunCall->getParent()->getParent()->getRegInfo(); diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index f4525e713c98..49749b563453 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -1825,7 +1825,24 @@ bool SPIRVInstructionSelector::selectAllocaArray(Register ResVReg, bool SPIRVInstructionSelector::selectFrameIndex(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { - return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpVariable)) + // Change order of instructions if needed: all 
OpVariable instructions in a + // function must be the first instructions in the first block + MachineFunction *MF = I.getParent()->getParent(); + MachineBasicBlock *MBB = &MF->front(); + auto It = MBB->SkipPHIsAndLabels(MBB->begin()), E = MBB->end(); + bool IsHeader = false; + unsigned Opcode; + for (; It != E && It != I; ++It) { + Opcode = It->getOpcode(); + if (Opcode == SPIRV::OpFunction || Opcode == SPIRV::OpFunctionParameter) { + IsHeader = true; + } else if (IsHeader && + !(Opcode == SPIRV::ASSIGN_TYPE || Opcode == SPIRV::OpLabel)) { + ++It; + break; + } + } + return BuildMI(*MBB, It, It->getDebugLoc(), TII.get(SPIRV::OpVariable)) .addDef(ResVReg) .addUse(GR.getSPIRVTypeID(ResType)) .addImm(static_cast(SPIRV::StorageClass::Function)) diff --git a/llvm/test/CodeGen/SPIRV/OpVariable_order.ll b/llvm/test/CodeGen/SPIRV/OpVariable_order.ll index a4ca3aa709f0..6057bf38d4c4 100644 --- a/llvm/test/CodeGen/SPIRV/OpVariable_order.ll +++ b/llvm/test/CodeGen/SPIRV/OpVariable_order.ll @@ -1,10 +1,14 @@ -; REQUIRES: spirv-tools -; RUN: llc -O0 -mtriple=spirv-unknown-linux %s -o - -filetype=obj | not spirv-val 2>&1 | FileCheck %s +; All OpVariable instructions in a function must be the first instructions in the first block -; TODO(#66261): The SPIR-V backend should reorder OpVariable instructions so this doesn't fail, -; but in the meantime it's a good example of the spirv-val tool working as intended. +; RUN: llc -O0 -mtriple=spirv-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-linux %s -o - -filetype=obj | spirv-val %} -; CHECK: All OpVariable instructions in a function must be the first instructions in the first block. 
+; CHECK-SPIRV: OpFunction +; CHECK-SPIRV-NEXT: OpLabel +; CHECK-SPIRV-NEXT: OpVariable +; CHECK-SPIRV-NEXT: OpVariable +; CHECK-SPIRV: OpReturn +; CHECK-SPIRV: OpFunctionEnd define void @main() #1 { entry: diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll index 1071d3443056..b039f80860da 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll @@ -10,22 +10,46 @@ ; CHECK-SPIRV-DAG: OpName %[[FooObj:.*]] "foo_object" ; CHECK-SPIRV-DAG: OpName %[[FooMemOrder:.*]] "mem_order" ; CHECK-SPIRV-DAG: OpName %[[FooFunc:.*]] "foo" + ; CHECK-SPIRV-DAG: %[[TyLong:.*]] = OpTypeInt 32 0 ; CHECK-SPIRV-DAG: %[[TyVoid:.*]] = OpTypeVoid +; CHECK-SPIRV-DAG: %[[TyGenPtrLong:.*]] = OpTypePointer Generic %[[TyLong]] ; CHECK-SPIRV-DAG: %[[TyPtrLong:.*]] = OpTypePointer CrossWorkgroup %[[TyLong]] ; CHECK-SPIRV-DAG: %[[TyFunPtrLong:.*]] = OpTypeFunction %[[TyVoid]] %[[TyPtrLong]] -; CHECK-SPIRV-DAG: %[[TyGenPtrLong:.*]] = OpTypePointer Generic %[[TyLong]] +; CHECK-SPIRV-DAG: %[[TyGenPtrPtrLong:.*]] = OpTypePointer Generic %[[TyGenPtrLong]] ; CHECK-SPIRV-DAG: %[[TyFunGenPtrLongLong:.*]] = OpTypeFunction %[[TyVoid]] %[[TyGenPtrLong]] %[[TyLong]] +; CHECK-SPIRV-DAG: %[[TyChar:.*]] = OpTypeInt 8 0 +; CHECK-SPIRV-DAG: %[[TyGenPtrChar:.*]] = OpTypePointer Generic %[[TyChar]] +; CHECK-SPIRV-DAG: %[[TyGenPtrPtrChar:.*]] = OpTypePointer Generic %[[TyGenPtrChar]] +; CHECK-SPIRV-DAG: %[[TyFunPtrGenPtrChar:.*]] = OpTypePointer Function %[[TyGenPtrChar]] ; CHECK-SPIRV-DAG: %[[Const3:.*]] = OpConstant %[[TyLong]] 3 + ; CHECK-SPIRV: %[[FunTest]] = OpFunction %[[TyVoid]] None %[[TyFunPtrLong]] ; CHECK-SPIRV: %[[ArgCum]] = OpFunctionParameter %[[TyPtrLong]] + ; CHECK-SPIRV: OpFunctionCall %[[TyVoid]] %[[FooFunc]] %[[Addr]] %[[Const3]] + +; CHECK-SPIRV: %[[HalfAddr:.*]] = OpPtrCastToGeneric +; CHECK-SPIRV-NEXT: %[[HalfAddrCasted:.*]] = 
OpBitcast %[[TyGenPtrLong]] %[[HalfAddr]] +; CHECK-SPIRV-NEXT: OpFunctionCall %[[TyVoid]] %[[FooFunc]] %[[HalfAddrCasted]] %[[Const3]] + +; CHECK-SPIRV: %[[DblAddr:.*]] = OpPtrCastToGeneric +; CHECK-SPIRV-NEXT: %[[DblAddrCasted:.*]] = OpBitcast %[[TyGenPtrLong]] %[[DblAddr]] +; CHECK-SPIRV-NEXT: OpFunctionCall %[[TyVoid]] %[[FooFunc]] %[[DblAddrCasted]] %[[Const3]] + ; CHECK-SPIRV: %[[FooStub]] = OpFunction %[[TyVoid]] None %[[TyFunGenPtrLongLong]] ; CHECK-SPIRV: %[[StubObj]] = OpFunctionParameter %[[TyGenPtrLong]] ; CHECK-SPIRV: %[[MemOrder]] = OpFunctionParameter %[[TyLong]] + +; CHECK-SPIRV: %[[ObjectAddr:.*]] = OpVariable %[[TyFunPtrGenPtrChar]] Function +; CHECK-SPIRV-NEXT: %[[ToGeneric:.*]] = OpPtrCastToGeneric %[[TyGenPtrPtrChar]] %[[ObjectAddr]] +; CHECK-SPIRV-NEXT: %[[Casted:.*]] = OpBitcast %[[TyGenPtrPtrLong]] %[[ToGeneric]] +; CHECK-SPIRV-NEXT: OpStore %[[Casted]] %[[StubObj]] + ; CHECK-SPIRV: %[[FooFunc]] = OpFunction %[[TyVoid]] None %[[TyFunGenPtrLongLong]] ; CHECK-SPIRV: %[[FooObj]] = OpFunctionParameter %[[TyGenPtrLong]] ; CHECK-SPIRV: %[[FooMemOrder]] = OpFunctionParameter %[[TyLong]] + ; CHECK-SPIRV: OpFunctionCall %[[TyVoid]] %[[FooStub]] %[[FooObj]] %[[FooMemOrder]] define spir_kernel void @test(ptr addrspace(1) noundef align 4 %_arg_cum) { diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll new file mode 100644 index 000000000000..edb31ffeee8e --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll @@ -0,0 +1,60 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV-DAG: OpName %[[Foo:.*]] "foo" +; CHECK-SPIRV-DAG: %[[TyChar:.*]] = OpTypeInt 8 0 +; CHECK-SPIRV-DAG: %[[TyVoid:.*]] = OpTypeVoid +; CHECK-SPIRV-DAG: %[[TyGenPtrChar:.*]] = OpTypePointer Generic 
%[[TyChar]] +; CHECK-SPIRV-DAG: %[[TyFunBar:.*]] = OpTypeFunction %[[TyVoid]] %[[TyGenPtrChar]] +; CHECK-SPIRV-DAG: %[[TyLong:.*]] = OpTypeInt 64 0 +; CHECK-SPIRV-DAG: %[[TyGenPtrPtrChar:.*]] = OpTypePointer Generic %[[TyGenPtrChar]] +; CHECK-SPIRV-DAG: %[[TyFunFoo:.*]] = OpTypeFunction %[[TyVoid]] %[[TyLong]] %[[TyGenPtrPtrChar]] %[[TyGenPtrPtrChar]] +; CHECK-SPIRV-DAG: %[[TyStruct:.*]] = OpTypeStruct %[[TyLong]] +; CHECK-SPIRV-DAG: %[[Const100:.*]] = OpConstant %[[TyLong]] 100 +; CHECK-SPIRV-DAG: %[[TyFunPtrGenPtrChar:.*]] = OpTypePointer Function %[[TyGenPtrChar]] +; CHECK-SPIRV-DAG: %[[TyPtrStruct:.*]] = OpTypePointer Generic %[[TyStruct]] +; CHECK-SPIRV-DAG: %[[TyPtrLong:.*]] = OpTypePointer Generic %[[TyLong]] + +; CHECK-SPIRV: %[[Bar:.*]] = OpFunction %[[TyVoid]] None %[[TyFunBar]] +; CHECK-SPIRV: %[[BarArg:.*]] = OpFunctionParameter %[[TyGenPtrChar]] +; CHECK-SPIRV-NEXT: OpLabel +; CHECK-SPIRV-NEXT: OpVariable %[[TyFunPtrGenPtrChar]] Function +; CHECK-SPIRV-NEXT: OpVariable %[[TyFunPtrGenPtrChar]] Function +; CHECK-SPIRV-NEXT: OpVariable %[[TyFunPtrGenPtrChar]] Function +; CHECK-SPIRV: %[[Var1:.*]] = OpPtrCastToGeneric %[[TyGenPtrPtrChar]] %[[#]] +; CHECK-SPIRV: %[[Var2:.*]] = OpPtrCastToGeneric %[[TyGenPtrPtrChar]] %[[#]] +; CHECK-SPIRV: OpStore %[[#]] %[[BarArg]] +; CHECK-SPIRV-NEXT: OpFunctionCall %[[TyVoid]] %[[Foo]] %[[Const100]] %[[Var1]] %[[Var2]] +; CHECK-SPIRV-NEXT: OpFunctionCall %[[TyVoid]] %[[Foo]] %[[Const100]] %[[Var2]] %[[Var1]] + +; CHECK-SPIRV: %[[Foo]] = OpFunction %[[TyVoid]] None %[[TyFunFoo]] +; CHECK-SPIRV-NEXT: OpFunctionParameter %[[TyLong]] +; CHECK-SPIRV-NEXT: OpFunctionParameter %[[TyGenPtrPtrChar]] +; CHECK-SPIRV-NEXT: OpFunctionParameter %[[TyGenPtrPtrChar]] + +%class.CustomType = type { i64 } + +define linkonce_odr dso_local spir_func void @bar(ptr addrspace(4) noundef %first) { +entry: + %first.addr = alloca ptr addrspace(4) + %first.addr.ascast = addrspacecast ptr %first.addr to ptr addrspace(4) + %temp = alloca ptr 
addrspace(4), align 8 + %temp.ascast = addrspacecast ptr %temp to ptr addrspace(4) + store ptr addrspace(4) %first, ptr %first.addr + call spir_func void @foo(i64 noundef 100, ptr addrspace(4) noundef dereferenceable(8) %first.addr.ascast, ptr addrspace(4) noundef dereferenceable(8) %temp.ascast) + call spir_func void @foo(i64 noundef 100, ptr addrspace(4) noundef dereferenceable(8) %temp.ascast, ptr addrspace(4) noundef dereferenceable(8) %first.addr.ascast) + %var = alloca ptr addrspace(4), align 8 + ret void +} + +define linkonce_odr dso_local spir_func void @foo(i64 noundef %offset, ptr addrspace(4) noundef dereferenceable(8) %in_acc1, ptr addrspace(4) noundef dereferenceable(8) %out_acc1) { +entry: + %r0 = load ptr addrspace(4), ptr addrspace(4) %in_acc1 + %arrayidx = getelementptr inbounds %class.CustomType, ptr addrspace(4) %r0, i64 42 + %r1 = load i64, ptr addrspace(4) %arrayidx + %r3 = load ptr addrspace(4), ptr addrspace(4) %out_acc1 + %r4 = getelementptr %class.CustomType, ptr addrspace(4) %r3, i64 43 + store i64 %r1, ptr addrspace(4) %r4 + ret void +} + -- cgit v1.2.3 From 12735916bd3a63aa9f316af8eebfe9420cfec489 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Thu, 4 Apr 2024 11:10:30 +0200 Subject: Revert "[libc] Refactor `BigInt`" (#87612) Reverts llvm/llvm-project#86137 Some aarch64 compilers seem to consider that `uint128_t` is not `is_trivially_constructible` which prevents `bit_cast`-ing. 
--- libc/fuzzing/CMakeLists.txt | 1 - libc/fuzzing/__support/CMakeLists.txt | 7 - libc/fuzzing/__support/uint_fuzz.cpp | 70 -- libc/src/__support/FPUtil/dyadic_float.h | 6 +- libc/src/__support/UInt.h | 1126 ++++++++++---------- libc/src/__support/float_to_string.h | 7 +- libc/src/__support/integer_literals.h | 25 +- libc/src/__support/math_extras.h | 249 +++-- libc/src/__support/number_pair.h | 11 + libc/test/src/__support/integer_literals_test.cpp | 21 - libc/test/src/__support/math_extras_test.cpp | 57 - libc/test/src/__support/uint_test.cpp | 192 +--- .../libc/test/src/__support/BUILD.bazel | 1 - 13 files changed, 762 insertions(+), 1011 deletions(-) delete mode 100644 libc/fuzzing/__support/CMakeLists.txt delete mode 100644 libc/fuzzing/__support/uint_fuzz.cpp diff --git a/libc/fuzzing/CMakeLists.txt b/libc/fuzzing/CMakeLists.txt index 816691b4bd44..82487688af11 100644 --- a/libc/fuzzing/CMakeLists.txt +++ b/libc/fuzzing/CMakeLists.txt @@ -1,7 +1,6 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=fuzzer") add_custom_target(libc-fuzzer) -add_subdirectory(__support) # TODO(#85680): Re-enable math fuzzing after headers are sorted out # add_subdirectory(math) add_subdirectory(stdlib) diff --git a/libc/fuzzing/__support/CMakeLists.txt b/libc/fuzzing/__support/CMakeLists.txt deleted file mode 100644 index 278e914e3fbe..000000000000 --- a/libc/fuzzing/__support/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_libc_fuzzer( - uint_fuzz - SRCS - uint_fuzz.cpp - DEPENDS - libc.src.__support.uint -) diff --git a/libc/fuzzing/__support/uint_fuzz.cpp b/libc/fuzzing/__support/uint_fuzz.cpp deleted file mode 100644 index f48f00d3b4ba..000000000000 --- a/libc/fuzzing/__support/uint_fuzz.cpp +++ /dev/null @@ -1,70 +0,0 @@ -#include "src/__support/CPP/bit.h" -#include "src/__support/UInt.h" -#include "src/string/memory_utils/inline_memcpy.h" - -using namespace LIBC_NAMESPACE; - -// Helper function when using gdb / lldb to set a breakpoint and inspect values. 
-template void debug_and_trap(const char *msg, T a, T b) { - __builtin_trap(); -} - -#define DEBUG_AND_TRAP() - -#define TEST_BINOP(OP) \ - if ((a OP b) != (static_cast(BigInt(a) OP BigInt(b)))) \ - debug_and_trap(#OP, a, b); - -#define TEST_SHIFTOP(OP) \ - if ((a OP b) != (static_cast(BigInt(a) OP b))) \ - debug_and_trap(#OP, a, b); - -#define TEST_FUNCTION(FUN) \ - if (FUN(a) != FUN(BigInt(a))) \ - debug_and_trap(#FUN, a, b); - -// Test that basic arithmetic operations of BigInt behave like their scalar -// counterparts. -template void run_tests(T a, T b) { - TEST_BINOP(+) - TEST_BINOP(-) - TEST_BINOP(*) - if (b != 0) - TEST_BINOP(/) - if (b >= 0 && b < cpp::numeric_limits::digits) { - TEST_SHIFTOP(<<) - TEST_SHIFTOP(>>) - } - if constexpr (!BigInt::SIGNED) { - TEST_FUNCTION(cpp::has_single_bit) - TEST_FUNCTION(cpp::countr_zero) - TEST_FUNCTION(cpp::countl_zero) - TEST_FUNCTION(cpp::countl_one) - TEST_FUNCTION(cpp::countr_one) - } -} - -// Reads a T from libfuzzer data. -template T read(const uint8_t *data, size_t &remainder) { - T out = 0; - constexpr size_t T_SIZE = sizeof(T); - const size_t copy_size = remainder < T_SIZE ? remainder : T_SIZE; - inline_memcpy(&out, data, copy_size); - remainder -= copy_size; - return out; -} - -template -void run_tests(const uint8_t *data, size_t size) { - const auto a = read(data, size); - const auto b = read(data, size); - run_tests(a, b); -} - -extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { - // unsigned - run_tests>(data, size); - // signed - run_tests>(data, size); - return 0; -} diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h index e0c205f52383..73fd7381c3c8 100644 --- a/libc/src/__support/FPUtil/dyadic_float.h +++ b/libc/src/__support/FPUtil/dyadic_float.h @@ -58,9 +58,9 @@ template struct DyadicFloat { // significant bit. 
LIBC_INLINE constexpr DyadicFloat &normalize() { if (!mantissa.is_zero()) { - int shift_length = cpp::countl_zero(mantissa); + int shift_length = static_cast(mantissa.clz()); exponent -= shift_length; - mantissa <<= static_cast(shift_length); + mantissa.shift_left(static_cast(shift_length)); } return *this; } @@ -233,7 +233,7 @@ LIBC_INLINE constexpr DyadicFloat quick_add(DyadicFloat a, result.sign = a.sign; result.exponent = a.exponent; result.mantissa = a.mantissa; - if (result.mantissa.add_overflow(b.mantissa)) { + if (result.mantissa.add(b.mantissa)) { // Mantissa addition overflow. result.shift_right(1); result.mantissa.val[DyadicFloat::MantissaType::WORD_COUNT - 1] |= diff --git a/libc/src/__support/UInt.h b/libc/src/__support/UInt.h index c524de38d986..282efdba1c5f 100644 --- a/libc/src/__support/UInt.h +++ b/libc/src/__support/UInt.h @@ -14,11 +14,10 @@ #include "src/__support/CPP/limits.h" #include "src/__support/CPP/optional.h" #include "src/__support/CPP/type_traits.h" -#include "src/__support/macros/attributes.h" // LIBC_INLINE -#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY -#include "src/__support/macros/properties/compiler.h" // LIBC_COMPILER_IS_CLANG +#include "src/__support/macros/attributes.h" // LIBC_INLINE +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128, LIBC_TYPES_HAS_INT64 -#include "src/__support/math_extras.h" // add_with_carry, sub_with_borrow +#include "src/__support/math_extras.h" // SumCarry, DiffBorrow #include "src/__support/number_pair.h" #include // For size_t @@ -26,321 +25,71 @@ namespace LIBC_NAMESPACE { -namespace multiword { - -// A type trait mapping unsigned integers to their half-width unsigned -// counterparts. 
+namespace internal { template struct half_width; -template <> struct half_width : cpp::type_identity {}; -template <> struct half_width : cpp::type_identity {}; -#ifdef LIBC_TYPES_HAS_INT64 + template <> struct half_width : cpp::type_identity {}; +template <> struct half_width : cpp::type_identity {}; +template <> struct half_width : cpp::type_identity {}; #ifdef LIBC_TYPES_HAS_INT128 template <> struct half_width<__uint128_t> : cpp::type_identity {}; #endif // LIBC_TYPES_HAS_INT128 -#endif // LIBC_TYPES_HAS_INT64 -template using half_width_t = typename half_width::type; - -// An array of two elements that can be used in multiword operations. -template struct DoubleWide final : cpp::array { - using UP = cpp::array; - using UP::UP; - LIBC_INLINE constexpr DoubleWide(T lo, T hi) : UP({lo, hi}) {} -}; - -// Converts an unsigned value into a DoubleWide>. -template LIBC_INLINE constexpr auto split(T value) { - static_assert(cpp::is_unsigned_v); - return cpp::bit_cast>>(value); -} - -// The low part of a DoubleWide value. -template LIBC_INLINE constexpr T lo(const DoubleWide &value) { - return value[0]; -} -// The high part of a DoubleWide value. -template LIBC_INLINE constexpr T hi(const DoubleWide &value) { - return value[1]; -} -// The low part of an unsigned value. -template LIBC_INLINE constexpr half_width_t lo(T value) { - return lo(split(value)); -} -// The high part of an unsigned value. -template LIBC_INLINE constexpr half_width_t hi(T value) { - return hi(split(value)); -} - -// Returns 'a' times 'b' in a DoubleWide. Cannot overflow by construction. 
-template -LIBC_INLINE constexpr DoubleWide mul2(word a, word b) { - if constexpr (cpp::is_same_v) { - return split(uint16_t(a) * uint16_t(b)); - } else if constexpr (cpp::is_same_v) { - return split(uint32_t(a) * uint32_t(b)); - } -#ifdef LIBC_TYPES_HAS_INT64 - else if constexpr (cpp::is_same_v) { - return split(uint64_t(a) * uint64_t(b)); - } -#endif -#ifdef LIBC_TYPES_HAS_INT128 - else if constexpr (cpp::is_same_v) { - return split<__uint128_t>(__uint128_t(a) * __uint128_t(b)); - } -#endif - else { - using half_word = half_width_t; - const auto shiftl = [](word value) -> word { - return value << cpp::numeric_limits::digits; - }; - const auto shiftr = [](word value) -> word { - return value >> cpp::numeric_limits::digits; - }; - // Here we do a one digit multiplication where 'a' and 'b' are of type - // word. We split 'a' and 'b' into half words and perform the classic long - // multiplication with 'a' and 'b' being two-digit numbers. - - // a a_hi a_lo - // x b => x b_hi b_lo - // ---- ----------- - // c result - // We convert 'lo' and 'hi' from 'half_word' to 'word' so multiplication - // doesn't overflow. - const word a_lo = lo(a); - const word b_lo = lo(b); - const word a_hi = hi(a); - const word b_hi = hi(b); - const word step1 = b_lo * a_lo; // no overflow; - const word step2 = b_lo * a_hi; // no overflow; - const word step3 = b_hi * a_lo; // no overflow; - const word step4 = b_hi * a_hi; // no overflow; - word lo_digit = step1; - word hi_digit = step4; - const word no_carry = 0; - word carry; - word _; // unused carry variable. - lo_digit = add_with_carry(lo_digit, shiftl(step2), no_carry, carry); - hi_digit = add_with_carry(hi_digit, shiftr(step2), carry, _); - lo_digit = add_with_carry(lo_digit, shiftl(step3), no_carry, carry); - hi_digit = add_with_carry(hi_digit, shiftr(step3), carry, _); - return DoubleWide(lo_digit, hi_digit); - } -} - -// In-place 'dst op= rhs' with operation with carry propagation. Returns carry. 
-template -LIBC_INLINE constexpr word inplace_binop(Function op_with_carry, - cpp::array &dst, - const cpp::array &rhs) { - static_assert(N >= M); - word carry_out = 0; - for (size_t i = 0; i < N; ++i) { - const bool has_rhs_value = i < M; - const word rhs_value = has_rhs_value ? rhs[i] : 0; - const word carry_in = carry_out; - dst[i] = op_with_carry(dst[i], rhs_value, carry_in, carry_out); - // stop early when rhs is over and no carry is to be propagated. - if (!has_rhs_value && carry_out == 0) - break; - } - return carry_out; -} -// In-place addition. Returns carry. -template -LIBC_INLINE constexpr word add_with_carry(cpp::array &dst, - const cpp::array &rhs) { - return inplace_binop(LIBC_NAMESPACE::add_with_carry, dst, rhs); -} +template using half_width_t = typename half_width::type; -// In-place subtraction. Returns borrow. -template -LIBC_INLINE constexpr word sub_with_borrow(cpp::array &dst, - const cpp::array &rhs) { - return inplace_binop(LIBC_NAMESPACE::sub_with_borrow, dst, rhs); -} +template constexpr NumberPair full_mul(T a, T b) { + NumberPair pa = split(a); + NumberPair pb = split(b); + NumberPair prod; -// In-place multiply-add. Returns carry. -// i.e., 'dst += b * c' -template -LIBC_INLINE constexpr word mul_add_with_carry(cpp::array &dst, word b, - word c) { - return add_with_carry(dst, mul2(b, c)); -} + prod.lo = pa.lo * pb.lo; // exact + prod.hi = pa.hi * pb.hi; // exact + NumberPair lo_hi = split(pa.lo * pb.hi); // exact + NumberPair hi_lo = split(pa.hi * pb.lo); // exact -// An array of two elements serving as an accumulator during multiword -// computations. 
-template struct Accumulator final : cpp::array { - using UP = cpp::array; - LIBC_INLINE constexpr Accumulator() : UP({0, 0}) {} - LIBC_INLINE constexpr T advance(T carry_in) { - auto result = UP::front(); - UP::front() = UP::back(); - UP::back() = carry_in; - return result; - } - LIBC_INLINE constexpr T sum() const { return UP::front(); } - LIBC_INLINE constexpr T carry() const { return UP::back(); } -}; + constexpr size_t HALF_BIT_WIDTH = sizeof(T) * CHAR_BIT / 2; -// In-place multiplication by a single word. Returns carry. -template -LIBC_INLINE constexpr word scalar_multiply_with_carry(cpp::array &dst, - word x) { - Accumulator acc; - for (auto &val : dst) { - const word carry = mul_add_with_carry(acc, val, x); - val = acc.advance(carry); - } - return acc.carry(); -} + auto r1 = add_with_carry(prod.lo, lo_hi.lo << HALF_BIT_WIDTH, T(0)); + prod.lo = r1.sum; + prod.hi = add_with_carry(prod.hi, lo_hi.hi, r1.carry).sum; -// Multiplication of 'lhs' by 'rhs' into 'dst'. Returns carry. -// This function is safe to use for signed numbers. -// https://stackoverflow.com/a/20793834 -// https://pages.cs.wisc.edu/%7Emarkhill/cs354/Fall2008/beyond354/int.mult.html -template -LIBC_INLINE constexpr word multiply_with_carry(cpp::array &dst, - const cpp::array &lhs, - const cpp::array &rhs) { - static_assert(O >= M + N); - Accumulator acc; - for (size_t i = 0; i < O; ++i) { - const size_t lower_idx = i < N ? 0 : i - N + 1; - const size_t upper_idx = i < M ? 
i : M - 1; - word carry = 0; - for (size_t j = lower_idx; j <= upper_idx; ++j) - carry += mul_add_with_carry(acc, lhs[j], rhs[i - j]); - dst[i] = acc.advance(carry); - } - return acc.carry(); -} + auto r2 = add_with_carry(prod.lo, hi_lo.lo << HALF_BIT_WIDTH, T(0)); + prod.lo = r2.sum; + prod.hi = add_with_carry(prod.hi, hi_lo.hi, r2.carry).sum; -template -LIBC_INLINE constexpr void quick_mul_hi(cpp::array &dst, - const cpp::array &lhs, - const cpp::array &rhs) { - Accumulator acc; - word carry = 0; - // First round of accumulation for those at N - 1 in the full product. - for (size_t i = 0; i < N; ++i) - carry += mul_add_with_carry(acc, lhs[i], rhs[N - 1 - i]); - for (size_t i = N; i < 2 * N - 1; ++i) { - acc.advance(carry); - carry = 0; - for (size_t j = i - N + 1; j < N; ++j) - carry += mul_add_with_carry(acc, lhs[j], rhs[i - j]); - dst[i - N] = acc.sum(); - } - dst.back() = acc.carry(); + return prod; } -template -LIBC_INLINE constexpr bool is_negative(cpp::array &array) { - using signed_word = cpp::make_signed_t; - return cpp::bit_cast(array.back()) < 0; +template <> +LIBC_INLINE constexpr NumberPair full_mul(uint32_t a, + uint32_t b) { + uint64_t prod = uint64_t(a) * uint64_t(b); + NumberPair result; + result.lo = uint32_t(prod); + result.hi = uint32_t(prod >> 32); + return result; } -// An enum for the shift function below. -enum Direction { LEFT, RIGHT }; - -// A bitwise shift on an array of elements. -// TODO: Make the result UB when 'offset' is greater or equal to the number of -// bits in 'array'. This will allow for better code performance. 
-template -LIBC_INLINE constexpr cpp::array shift(cpp::array array, - size_t offset) { - static_assert(direction == LEFT || direction == RIGHT); - constexpr size_t WORD_BITS = cpp::numeric_limits::digits; - constexpr size_t TOTAL_BITS = N * WORD_BITS; - if (LIBC_UNLIKELY(offset == 0)) - return array; - if (LIBC_UNLIKELY(offset >= TOTAL_BITS)) - return {}; #ifdef LIBC_TYPES_HAS_INT128 - if constexpr (TOTAL_BITS == 128) { - using type = cpp::conditional_t; - auto tmp = cpp::bit_cast(array); - if constexpr (direction == LEFT) - tmp <<= offset; - else - tmp >>= offset; - return cpp::bit_cast>(tmp); - } -#endif - const bool is_neg = is_signed && is_negative(array); - constexpr auto at = [](size_t index) -> int { - // reverse iteration when direction == LEFT. - if constexpr (direction == LEFT) - return int(N) - int(index) - 1; - return int(index); - }; - const auto safe_get_at = [&](size_t index) -> word { - // return appropriate value when accessing out of bound elements. - const int i = at(index); - if (i < 0) - return 0; - if (i >= int(N)) - return is_neg ? -1 : 0; - return array[i]; - }; - const size_t index_offset = offset / WORD_BITS; - const size_t bit_offset = offset % WORD_BITS; -#ifdef LIBC_COMPILER_IS_CLANG - __builtin_assume(index_offset < N); -#endif - cpp::array out = {}; - for (size_t index = 0; index < N; ++index) { - const word part1 = safe_get_at(index + index_offset); - const word part2 = safe_get_at(index + index_offset + 1); - word &dst = out[at(index)]; - if (bit_offset == 0) - dst = part1; // no crosstalk between parts. 
- else if constexpr (direction == LEFT) - dst = (part1 << bit_offset) | (part2 >> (WORD_BITS - bit_offset)); - else - dst = (part1 >> bit_offset) | (part2 << (WORD_BITS - bit_offset)); - } - return out; +template <> +LIBC_INLINE constexpr NumberPair full_mul(uint64_t a, + uint64_t b) { + __uint128_t prod = __uint128_t(a) * __uint128_t(b); + NumberPair result; + result.lo = uint64_t(prod); + result.hi = uint64_t(prod >> 64); + return result; } +#endif // LIBC_TYPES_HAS_INT128 -#define DECLARE_COUNTBIT(NAME, INDEX_EXPR) \ - template \ - LIBC_INLINE constexpr int NAME(const cpp::array &val) { \ - int bit_count = 0; \ - for (size_t i = 0; i < N; ++i) { \ - const int word_count = cpp::NAME(val[INDEX_EXPR]); \ - bit_count += word_count; \ - if (word_count != cpp::numeric_limits::digits) \ - break; \ - } \ - return bit_count; \ - } - -DECLARE_COUNTBIT(countr_zero, i) // iterating forward -DECLARE_COUNTBIT(countr_one, i) // iterating forward -DECLARE_COUNTBIT(countl_zero, N - i - 1) // iterating backward -DECLARE_COUNTBIT(countl_one, N - i - 1) // iterating backward - -} // namespace multiword +} // namespace internal template struct BigInt { -private: static_assert(cpp::is_integral_v && cpp::is_unsigned_v, "WordType must be unsigned integer."); - struct Division { - BigInt quotient; - BigInt remainder; - }; - -public: using word_type = WordType; - using unsigned_type = BigInt; - using signed_type = BigInt; - LIBC_INLINE_VAR static constexpr bool SIGNED = Signed; LIBC_INLINE_VAR static constexpr size_t BITS = Bits; LIBC_INLINE_VAR @@ -351,7 +100,10 @@ public: LIBC_INLINE_VAR static constexpr size_t WORD_COUNT = Bits / WORD_SIZE; - cpp::array val{}; // zero initialized. 
+ using unsigned_type = BigInt; + using signed_type = BigInt; + + cpp::array val{}; LIBC_INLINE constexpr BigInt() = default; @@ -360,67 +112,76 @@ public: template LIBC_INLINE constexpr BigInt( const BigInt &other) { - if (OtherBits >= Bits) { // truncate + if (OtherBits >= Bits) { for (size_t i = 0; i < WORD_COUNT; ++i) val[i] = other[i]; - } else { // zero or sign extend + } else { size_t i = 0; for (; i < OtherBits / WORD_SIZE; ++i) val[i] = other[i]; - extend(i, Signed && other.is_neg()); + WordType sign = 0; + if constexpr (Signed && OtherSigned) { + sign = static_cast( + -static_cast>(other.is_neg())); + } + for (; i < WORD_COUNT; ++i) + val[i] = sign; } } // Construct a BigInt from a C array. - template LIBC_INLINE constexpr BigInt(const WordType (&nums)[N]) { - static_assert(N == WORD_COUNT); - for (size_t i = 0; i < WORD_COUNT; ++i) + template = 0> + LIBC_INLINE constexpr BigInt(const WordType (&nums)[N]) { + size_t min_wordcount = N < WORD_COUNT ? N : WORD_COUNT; + size_t i = 0; + for (; i < min_wordcount; ++i) val[i] = nums[i]; - } - LIBC_INLINE constexpr explicit BigInt( - const cpp::array &words) { - val = words; + // If nums doesn't completely fill val, then fill the rest with zeroes. + for (; i < WORD_COUNT; ++i) + val[i] = 0; } // Initialize the first word to |v| and the rest to 0. 
template >> LIBC_INLINE constexpr BigInt(T v) { - constexpr size_t T_SIZE = sizeof(T) * CHAR_BIT; - const bool is_neg = Signed && (v < 0); - for (size_t i = 0; i < WORD_COUNT; ++i) { - if (v == 0) { - extend(i, is_neg); - return; + val[0] = static_cast(v); + + if constexpr (WORD_COUNT == 1) + return; + + if constexpr (Bits < sizeof(T) * CHAR_BIT) { + for (int i = 1; i < WORD_COUNT; ++i) { + v >>= WORD_SIZE; + val[i] = static_cast(v); } - val[i] = static_cast(v); - if constexpr (T_SIZE > WORD_SIZE) + return; + } + + size_t i = 1; + + if constexpr (WORD_SIZE < sizeof(T) * CHAR_BIT) + for (; i < sizeof(T) * CHAR_BIT / WORD_SIZE; ++i) { v >>= WORD_SIZE; - else - v = 0; + val[i] = static_cast(v); + } + + WordType sign = (Signed && (v < 0)) ? ~WordType(0) : WordType(0); + for (; i < WORD_COUNT; ++i) { + val[i] = sign; } } - LIBC_INLINE constexpr BigInt &operator=(const BigInt &other) = default; - // constants - LIBC_INLINE static constexpr BigInt zero() { return BigInt(); } - LIBC_INLINE static constexpr BigInt one() { return BigInt(1); } - LIBC_INLINE static constexpr BigInt all_ones() { return ~zero(); } - LIBC_INLINE static constexpr BigInt min() { - BigInt out; - if constexpr (SIGNED) - out.set_msb(); - return out; - } - LIBC_INLINE static constexpr BigInt max() { - BigInt out = all_ones(); - if constexpr (SIGNED) - out.clear_msb(); - return out; + LIBC_INLINE constexpr explicit BigInt( + const cpp::array &words) { + for (size_t i = 0; i < WORD_COUNT; ++i) + val[i] = words[i]; } // TODO: Reuse the Sign type. 
- LIBC_INLINE constexpr bool is_neg() const { return SIGNED && get_msb(); } + LIBC_INLINE constexpr bool is_neg() const { + return val.back() >> (WORD_SIZE - 1); + } template LIBC_INLINE constexpr explicit operator T() const { return to(); @@ -430,100 +191,200 @@ public: LIBC_INLINE constexpr cpp::enable_if_t< cpp::is_integral_v && !cpp::is_same_v, T> to() const { - constexpr size_t T_SIZE = sizeof(T) * CHAR_BIT; T lo = static_cast(val[0]); - if constexpr (T_SIZE <= WORD_SIZE) + + constexpr size_t T_BITS = sizeof(T) * CHAR_BIT; + + if constexpr (T_BITS <= WORD_SIZE) return lo; + constexpr size_t MAX_COUNT = - T_SIZE > Bits ? WORD_COUNT : T_SIZE / WORD_SIZE; + T_BITS > Bits ? WORD_COUNT : T_BITS / WORD_SIZE; for (size_t i = 1; i < MAX_COUNT; ++i) lo += static_cast(val[i]) << (WORD_SIZE * i); - if constexpr (Signed && (T_SIZE > Bits)) { + + if constexpr (Signed && (T_BITS > Bits)) { // Extend sign for negative numbers. constexpr T MASK = (~T(0) << Bits); if (is_neg()) lo |= MASK; } + return lo; } LIBC_INLINE constexpr explicit operator bool() const { return !is_zero(); } + LIBC_INLINE constexpr BigInt &operator=(const BigInt &other) = default; + LIBC_INLINE constexpr bool is_zero() const { - for (auto part : val) - if (part != 0) + for (size_t i = 0; i < WORD_COUNT; ++i) { + if (val[i] != 0) return false; + } return true; } - // Add 'rhs' to this number and store the result in this number. + // Add x to this number and store the result in this number. // Returns the carry value produced by the addition operation. 
- LIBC_INLINE constexpr WordType add_overflow(const BigInt &rhs) { - return multiword::add_with_carry(val, rhs.val); + LIBC_INLINE constexpr WordType add(const BigInt &x) { + SumCarry s{0, 0}; + for (size_t i = 0; i < WORD_COUNT; ++i) { + s = add_with_carry(val[i], x.val[i], s.carry); + val[i] = s.sum; + } + return s.carry; } LIBC_INLINE constexpr BigInt operator+(const BigInt &other) const { - BigInt result = *this; - result.add_overflow(other); + BigInt result; + SumCarry s{0, 0}; + for (size_t i = 0; i < WORD_COUNT; ++i) { + s = add_with_carry(val[i], other.val[i], s.carry); + result.val[i] = s.sum; + } return result; } // This will only apply when initializing a variable from constant values, so // it will always use the constexpr version of add_with_carry. LIBC_INLINE constexpr BigInt operator+(BigInt &&other) const { - // We use addition commutativity to reuse 'other' and prevent allocation. - other.add_overflow(*this); // Returned carry value is ignored. - return other; + BigInt result; + SumCarry s{0, 0}; + for (size_t i = 0; i < WORD_COUNT; ++i) { + s = add_with_carry(val[i], other.val[i], s.carry); + result.val[i] = s.sum; + } + return result; } LIBC_INLINE constexpr BigInt &operator+=(const BigInt &other) { - add_overflow(other); // Returned carry value is ignored. + add(other); // Returned carry value is ignored. return *this; } - // Subtract 'rhs' to this number and store the result in this number. + // Subtract x to this number and store the result in this number. // Returns the carry value produced by the subtraction operation. 
- LIBC_INLINE constexpr WordType sub_overflow(const BigInt &rhs) { - return multiword::sub_with_borrow(val, rhs.val); + LIBC_INLINE constexpr WordType sub(const BigInt &x) { + DiffBorrow d{0, 0}; + for (size_t i = 0; i < WORD_COUNT; ++i) { + d = sub_with_borrow(val[i], x.val[i], d.borrow); + val[i] = d.diff; + } + return d.borrow; } LIBC_INLINE constexpr BigInt operator-(const BigInt &other) const { - BigInt result = *this; - result.sub_overflow(other); // Returned carry value is ignored. + BigInt result; + DiffBorrow d{0, 0}; + for (size_t i = 0; i < WORD_COUNT; ++i) { + d = sub_with_borrow(val[i], other.val[i], d.borrow); + result.val[i] = d.diff; + } return result; } LIBC_INLINE constexpr BigInt operator-(BigInt &&other) const { - BigInt result = *this; - result.sub_overflow(other); // Returned carry value is ignored. + BigInt result; + DiffBorrow d{0, 0}; + for (size_t i = 0; i < WORD_COUNT; ++i) { + d = sub_with_borrow(val[i], other.val[i], d.borrow); + result.val[i] = d.diff; + } return result; } LIBC_INLINE constexpr BigInt &operator-=(const BigInt &other) { // TODO(lntue): Set overflow flag / errno when carry is true. - sub_overflow(other); // Returned carry value is ignored. + sub(other); return *this; } - // Multiply this number with x and store the result in this number. + // Multiply this number with x and store the result in this number. It is + // implemented using the long multiplication algorithm by splitting the + // 64-bit words of this number and |x| in to 32-bit halves but peforming + // the operations using 64-bit numbers. This ensures that we don't lose the + // carry bits. + // Returns the carry value produced by the multiplication operation. 
LIBC_INLINE constexpr WordType mul(WordType x) { - return multiword::scalar_multiply_with_carry(val, x); + BigInt<2 * WORD_SIZE, Signed, WordType> partial_sum(0); + for (size_t i = 0; i < WORD_COUNT; ++i) { + NumberPair prod = internal::full_mul(val[i], x); + BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi}); + const WordType carry = partial_sum.add(tmp); + val[i] = partial_sum.val[0]; + partial_sum.val[0] = partial_sum.val[1]; + partial_sum.val[1] = carry; + } + return partial_sum.val[1]; + } + + LIBC_INLINE constexpr BigInt operator*(const BigInt &other) const { + if constexpr (Signed) { + BigInt a(*this); + BigInt b(other); + const bool a_neg = a.is_neg(); + const bool b_neg = b.is_neg(); + if (a_neg) + a = -a; + if (b_neg) + b = -b; + BigInt prod = a * b; + if (a_neg != b_neg) + prod = -prod; + return static_cast>(prod); + } else { + if constexpr (WORD_COUNT == 1) { + return {val[0] * other.val[0]}; + } else { + BigInt result(0); + BigInt<2 * WORD_SIZE, Signed, WordType> partial_sum(0); + WordType carry = 0; + for (size_t i = 0; i < WORD_COUNT; ++i) { + for (size_t j = 0; j <= i; j++) { + NumberPair prod = + internal::full_mul(val[j], other.val[i - j]); + BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi}); + carry += partial_sum.add(tmp); + } + result.val[i] = partial_sum.val[0]; + partial_sum.val[0] = partial_sum.val[1]; + partial_sum.val[1] = carry; + carry = 0; + } + return result; + } + } } - // Return the full product. + // Return the full product, only unsigned for now. template - LIBC_INLINE constexpr auto + LIBC_INLINE constexpr BigInt ful_mul(const BigInt &other) const { - BigInt result; - multiword::multiply_with_carry(result.val, val, other.val); + BigInt result(0); + BigInt<2 * WORD_SIZE, Signed, WordType> partial_sum(0); + WordType carry = 0; + constexpr size_t OTHER_WORDCOUNT = + BigInt::WORD_COUNT; + for (size_t i = 0; i <= WORD_COUNT + OTHER_WORDCOUNT - 2; ++i) { + const size_t lower_idx = + i < OTHER_WORDCOUNT ? 
0 : i - OTHER_WORDCOUNT + 1; + const size_t upper_idx = i < WORD_COUNT ? i : WORD_COUNT - 1; + for (size_t j = lower_idx; j <= upper_idx; ++j) { + NumberPair prod = + internal::full_mul(val[j], other.val[i - j]); + BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi}); + carry += partial_sum.add(tmp); + } + result.val[i] = partial_sum.val[0]; + partial_sum.val[0] = partial_sum.val[1]; + partial_sum.val[1] = carry; + carry = 0; + } + result.val[WORD_COUNT + OTHER_WORDCOUNT - 1] = partial_sum.val[0]; return result; } - LIBC_INLINE constexpr BigInt operator*(const BigInt &other) const { - // Perform full mul and truncate. - return BigInt(ful_mul(other)); - } - // Fast hi part of the full product. The normal product `operator*` returns // `Bits` least significant bits of the full product, while this function will // approximate `Bits` most significant bits of the full product with errors @@ -546,17 +407,39 @@ public: // 256 4 16 10 3 // 512 8 64 36 7 LIBC_INLINE constexpr BigInt quick_mul_hi(const BigInt &other) const { - BigInt result; - multiword::quick_mul_hi(result.val, val, other.val); + BigInt result(0); + BigInt<2 * WORD_SIZE, Signed, WordType> partial_sum(0); + WordType carry = 0; + // First round of accumulation for those at WORD_COUNT - 1 in the full + // product. 
+ for (size_t i = 0; i < WORD_COUNT; ++i) { + NumberPair prod = + internal::full_mul(val[i], other.val[WORD_COUNT - 1 - i]); + BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi}); + carry += partial_sum.add(tmp); + } + for (size_t i = WORD_COUNT; i < 2 * WORD_COUNT - 1; ++i) { + partial_sum.val[0] = partial_sum.val[1]; + partial_sum.val[1] = carry; + carry = 0; + for (size_t j = i - WORD_COUNT + 1; j < WORD_COUNT; ++j) { + NumberPair prod = + internal::full_mul(val[j], other.val[i - j]); + BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi}); + carry += partial_sum.add(tmp); + } + result.val[i - WORD_COUNT] = partial_sum.val[0]; + } + result.val[WORD_COUNT - 1] = partial_sum.val[1]; return result; } - // BigInt(x).pow_n(n) computes x ^ n. - // Note 0 ^ 0 == 1. + // pow takes a power and sets this to its starting value to that power. Zero + // to the zeroth power returns 1. LIBC_INLINE constexpr void pow_n(uint64_t power) { - static_assert(!Signed); - BigInt result = one(); + BigInt result = 1; BigInt cur_power = *this; + while (power > 0) { if ((power % 2) > 0) result *= cur_power; @@ -566,23 +449,38 @@ public: *this = result; } - // Performs inplace signed / unsigned division. Returns remainder if not - // dividing by zero. - // For signed numbers it behaves like C++ signed integer division. - // That is by truncating the fractionnal part - // https://stackoverflow.com/a/3602857 - LIBC_INLINE constexpr cpp::optional div(const BigInt ÷r) { - if (LIBC_UNLIKELY(divider.is_zero())) + // TODO: Make division work correctly for signed integers. + + // div takes another BigInt of the same size and divides this by it. The value + // of this will be set to the quotient, and the return value is the remainder. 
+ LIBC_INLINE constexpr cpp::optional div(const BigInt &other) { + BigInt remainder(0); + if (*this < other) { + remainder = *this; + *this = BigInt(0); + return remainder; + } + if (other == 1) { + return remainder; + } + if (other == 0) { return cpp::nullopt; - if (LIBC_UNLIKELY(divider == BigInt::one())) - return BigInt::zero(); - Division result; - if constexpr (SIGNED) - result = divide_signed(*this, divider); - else - result = divide_unsigned(*this, divider); - *this = result.quotient; - return result.remainder; + } + + BigInt quotient(0); + BigInt subtractor = other; + int cur_bit = static_cast(subtractor.clz() - this->clz()); + subtractor.shift_left(cur_bit); + + for (; cur_bit >= 0 && *this > 0; --cur_bit, subtractor.shift_right(1)) { + if (*this >= subtractor) { + this->sub(subtractor); + quotient = quotient | (BigInt(1) << cur_bit); + } + } + remainder = *this; + *this = quotient; + return remainder; } // Efficiently perform BigInt / (x * 2^e), where x is a half-word-size @@ -598,16 +496,19 @@ public: // computation of each step is now properly contained within WordType. // And finally we perform some extra alignment steps for the remaining bits. LIBC_INLINE constexpr cpp::optional - div_uint_half_times_pow_2(multiword::half_width_t x, size_t e) { - BigInt remainder; - if (x == 0) + div_uint_half_times_pow_2(internal::half_width_t x, size_t e) { + BigInt remainder(0); + + if (x == 0) { return cpp::nullopt; + } if (e >= Bits) { remainder = *this; - *this = BigInt(); + *this = BigInt(0); return remainder; } - BigInt quotient; + + BigInt quotient(0); WordType x_word = static_cast(x); constexpr size_t LOG2_WORD_SIZE = cpp::bit_width(WORD_SIZE) - 1; constexpr size_t HALF_WORD_SIZE = WORD_SIZE >> 1; @@ -732,22 +633,189 @@ public: return *this; } - LIBC_INLINE constexpr BigInt &operator<<=(size_t s) { - val = multiword::shift(val, s); - return *this; + // TODO: remove and use cpp::countl_zero below. 
+  [[nodiscard]] LIBC_INLINE constexpr int clz() const {
+    constexpr int word_digits = cpp::numeric_limits::digits;
+    int leading_zeroes = 0;
+    for (auto i = val.size(); i > 0;) {
+      --i;
+      const int zeroes = cpp::countl_zero(val[i]);
+      leading_zeroes += zeroes;
+      if (zeroes != word_digits)
+        break;
+    }
+    return leading_zeroes;
+  }
+
+  // TODO: remove and use cpp::countr_zero below.
+  [[nodiscard]] LIBC_INLINE constexpr int ctz() const {
+    constexpr int word_digits = cpp::numeric_limits::digits;
+    int trailing_zeroes = 0;
+    for (auto word : val) {
+      const int zeroes = cpp::countr_zero(word);
+      trailing_zeroes += zeroes;
+      if (zeroes != word_digits)
+        break;
+    }
+    return trailing_zeroes;
+  }
+
+  LIBC_INLINE constexpr void shift_left(size_t s) {
+    if constexpr (Bits == WORD_SIZE) {
+      // Use native types if possible.
+      if (s >= WORD_SIZE) {
+        val[0] = 0;
+        return;
+      }
+      val[0] <<= s;
+      return;
+    }
+    if constexpr ((Bits == 64) && (WORD_SIZE == 32)) {
+      // Use builtin 64 bits for 32-bit base type if available;
+      if (s >= 64) {
+        val[0] = 0;
+        val[1] = 0;
+        return;
+      }
+      uint64_t tmp = uint64_t(val[0]) + (uint64_t(val[1]) << 32);
+      tmp <<= s;
+      val[0] = uint32_t(tmp);
+      val[1] = uint32_t(tmp >> 32);
+      return;
+    }
+#ifdef LIBC_TYPES_HAS_INT128
+    if constexpr ((Bits == 128) && (WORD_SIZE == 64)) {
+      // Use builtin 128 bits if available;
+      if (s >= 128) {
+        val[0] = 0;
+        val[1] = 0;
+        return;
+      }
+      __uint128_t tmp = __uint128_t(val[0]) + (__uint128_t(val[1]) << 64);
+      tmp <<= s;
+      val[0] = uint64_t(tmp);
+      val[1] = uint64_t(tmp >> 64);
+      return;
+    }
+#endif // LIBC_TYPES_HAS_INT128
+    if (LIBC_UNLIKELY(s == 0))
+      return;
+
+    const size_t drop = s / WORD_SIZE;  // Number of words to drop
+    const size_t shift = s % WORD_SIZE; // Bits to shift in the remaining words.
+ size_t i = WORD_COUNT; + + if (drop < WORD_COUNT) { + i = WORD_COUNT - 1; + if (shift > 0) { + for (size_t j = WORD_COUNT - 1 - drop; j > 0; --i, --j) { + val[i] = (val[j] << shift) | (val[j - 1] >> (WORD_SIZE - shift)); + } + val[i] = val[0] << shift; + } else { + for (size_t j = WORD_COUNT - 1 - drop; j > 0; --i, --j) { + val[i] = val[j]; + } + val[i] = val[0]; + } + } + + for (size_t j = 0; j < i; ++j) { + val[j] = 0; + } } LIBC_INLINE constexpr BigInt operator<<(size_t s) const { - return BigInt(multiword::shift(val, s)); + BigInt result(*this); + result.shift_left(s); + return result; } - LIBC_INLINE constexpr BigInt &operator>>=(size_t s) { - val = multiword::shift(val, s); + LIBC_INLINE constexpr BigInt &operator<<=(size_t s) { + shift_left(s); return *this; } + LIBC_INLINE constexpr void shift_right(size_t s) { + if constexpr ((Bits == 64) && (WORD_SIZE == 32)) { + // Use builtin 64 bits if available; + if (s >= 64) { + val[0] = 0; + val[1] = 0; + return; + } + uint64_t tmp = uint64_t(val[0]) + (uint64_t(val[1]) << 32); + if constexpr (Signed) { + tmp = static_cast(static_cast(tmp) >> s); + } else { + tmp >>= s; + } + val[0] = uint32_t(tmp); + val[1] = uint32_t(tmp >> 32); + return; + } +#ifdef LIBC_TYPES_HAS_INT128 + if constexpr ((Bits == 128) && (WORD_SIZE == 64)) { + // Use builtin 128 bits if available; + if (s >= 128) { + val[0] = 0; + val[1] = 0; + return; + } + __uint128_t tmp = __uint128_t(val[0]) + (__uint128_t(val[1]) << 64); + if constexpr (Signed) { + tmp = static_cast<__uint128_t>(static_cast<__int128_t>(tmp) >> s); + } else { + tmp >>= s; + } + val[0] = uint64_t(tmp); + val[1] = uint64_t(tmp >> 64); + return; + } +#endif // LIBC_TYPES_HAS_INT128 + + if (LIBC_UNLIKELY(s == 0)) + return; + const size_t drop = s / WORD_SIZE; // Number of words to drop + const size_t shift = s % WORD_SIZE; // Bit shift in the remaining words. + + size_t i = 0; + WordType sign = Signed ? 
is_neg() : 0; + + if (drop < WORD_COUNT) { + if (shift > 0) { + for (size_t j = drop; j < WORD_COUNT - 1; ++i, ++j) { + val[i] = (val[j] >> shift) | (val[j + 1] << (WORD_SIZE - shift)); + } + if constexpr (Signed) { + val[i] = static_cast( + static_cast>(val[WORD_COUNT - 1]) >> + shift); + } else { + val[i] = val[WORD_COUNT - 1] >> shift; + } + ++i; + } else { + for (size_t j = drop; j < WORD_COUNT; ++i, ++j) { + val[i] = val[j]; + } + } + } + + for (; i < WORD_COUNT; ++i) { + val[i] = sign; + } + } + LIBC_INLINE constexpr BigInt operator>>(size_t s) const { - return BigInt(multiword::shift(val, s)); + BigInt result(*this); + result.shift_right(s); + return result; + } + + LIBC_INLINE constexpr BigInt &operator>>=(size_t s) { + shift_right(s); + return *this; } #define DEFINE_BINOP(OP) \ @@ -765,9 +833,10 @@ public: return lhs; \ } - DEFINE_BINOP(&) // & and &= - DEFINE_BINOP(|) // | and |= - DEFINE_BINOP(^) // ^ and ^= + DEFINE_BINOP(&) + DEFINE_BINOP(|) + DEFINE_BINOP(^) + #undef DEFINE_BINOP LIBC_INLINE constexpr BigInt operator~() const { @@ -778,8 +847,8 @@ public: } LIBC_INLINE constexpr BigInt operator-() const { - BigInt result(*this); - result.negate(); + BigInt result = ~(*this); + result.add(BigInt(1)); return result; } @@ -796,6 +865,24 @@ public: return !(lhs == rhs); } +private: + LIBC_INLINE friend constexpr int cmp(const BigInt &lhs, const BigInt &rhs) { + const auto compare = [](WordType a, WordType b) { + return a == b ? 0 : a > b ? 1 : -1; + }; + if constexpr (Signed) { + const bool lhs_is_neg = lhs.is_neg(); + const bool rhs_is_neg = rhs.is_neg(); + if (lhs_is_neg != rhs_is_neg) + return rhs_is_neg ? 
1 : -1; + } + for (size_t i = WORD_COUNT; i-- > 0;) + if (auto cmp = compare(lhs[i], rhs[i]); cmp != 0) + return cmp; + return 0; + } + +public: LIBC_INLINE friend constexpr bool operator>(const BigInt &lhs, const BigInt &rhs) { return cmp(lhs, rhs) > 0; @@ -814,24 +901,24 @@ public: } LIBC_INLINE constexpr BigInt &operator++() { - increment(); + add(BigInt(1)); return *this; } LIBC_INLINE constexpr BigInt operator++(int) { BigInt oldval(*this); - increment(); + add(BigInt(1)); return oldval; } LIBC_INLINE constexpr BigInt &operator--() { - decrement(); + sub(BigInt(1)); return *this; } LIBC_INLINE constexpr BigInt operator--(int) { BigInt oldval(*this); - decrement(); + sub(BigInt(1)); return oldval; } @@ -843,117 +930,9 @@ public: // Return the i-th word of the number. LIBC_INLINE constexpr WordType &operator[](size_t i) { return val[i]; } -private: - LIBC_INLINE friend constexpr int cmp(const BigInt &lhs, const BigInt &rhs) { - constexpr auto compare = [](WordType a, WordType b) { - return a == b ? 0 : a > b ? 1 : -1; - }; - if constexpr (Signed) { - const bool lhs_is_neg = lhs.is_neg(); - const bool rhs_is_neg = rhs.is_neg(); - if (lhs_is_neg != rhs_is_neg) - return rhs_is_neg ? 1 : -1; - } - for (size_t i = WORD_COUNT; i-- > 0;) - if (auto cmp = compare(lhs[i], rhs[i]); cmp != 0) - return cmp; - return 0; - } - - LIBC_INLINE constexpr void bitwise_not() { - for (auto &part : val) - part = ~part; - } - - LIBC_INLINE constexpr void negate() { - bitwise_not(); - increment(); - } + LIBC_INLINE WordType *data() { return val; } - LIBC_INLINE constexpr void increment() { - multiword::add_with_carry(val, cpp::array{1}); - } - - LIBC_INLINE constexpr void decrement() { - multiword::add_with_carry(val, cpp::array{1}); - } - - LIBC_INLINE constexpr void extend(size_t index, bool is_neg) { - const WordType value = is_neg ? 
cpp::numeric_limits::max() - : cpp::numeric_limits::min(); - for (size_t i = index; i < WORD_COUNT; ++i) - val[i] = value; - } - - LIBC_INLINE constexpr bool get_msb() const { - return val.back() >> (WORD_SIZE - 1); - } - - LIBC_INLINE constexpr void set_msb() { - val.back() |= mask_leading_ones(); - } - - LIBC_INLINE constexpr void clear_msb() { - val.back() &= mask_trailing_ones(); - } - - LIBC_INLINE constexpr void set_bit(size_t i) { - const size_t word_index = i / WORD_SIZE; - val[word_index] |= WordType(1) << (i % WORD_SIZE); - } - - LIBC_INLINE constexpr static Division divide_unsigned(const BigInt ÷nd, - const BigInt ÷r) { - BigInt remainder = dividend; - BigInt quotient; - if (remainder >= divider) { - BigInt subtractor = divider; - int cur_bit = multiword::countl_zero(subtractor.val) - - multiword::countl_zero(remainder.val); - subtractor <<= cur_bit; - for (; cur_bit >= 0 && remainder > 0; --cur_bit, subtractor >>= 1) { - if (remainder < subtractor) - continue; - remainder -= subtractor; - quotient.set_bit(cur_bit); - } - } - return Division{quotient, remainder}; - } - - LIBC_INLINE constexpr static Division divide_signed(const BigInt ÷nd, - const BigInt ÷r) { - // Special case because it is not possible to negate the min value of a - // signed integer. - if (dividend == min() && divider == min()) - return Division{one(), zero()}; - // 1. Convert the dividend and divisor to unsigned representation. - unsigned_type udividend(dividend); - unsigned_type udivider(divider); - // 2. Negate the dividend if it's negative, and similarly for the divisor. - const bool dividend_is_neg = dividend.is_neg(); - const bool divider_is_neg = divider.is_neg(); - if (dividend_is_neg) - udividend.negate(); - if (divider_is_neg) - udivider.negate(); - // 3. Use unsigned multiword division algorithm. - const auto unsigned_result = divide_unsigned(udividend, udivider); - // 4. Convert the quotient and remainder to signed representation. 
- Division result; - result.quotient = signed_type(unsigned_result.quotient); - result.remainder = signed_type(unsigned_result.remainder); - // 5. Negate the quotient if the dividend and divisor had opposite signs. - if (dividend_is_neg != divider_is_neg) - result.quotient.negate(); - // 6. Negate the remainder if the dividend was negative. - if (dividend_is_neg) - result.remainder.negate(); - return result; - } - - friend signed_type; - friend unsigned_type; + LIBC_INLINE const WordType *data() const { return val; } }; namespace internal { @@ -983,8 +962,10 @@ using Int = BigInt>; // Provides limits of U/Int<128>. template <> class cpp::numeric_limits> { public: - LIBC_INLINE static constexpr UInt<128> max() { return UInt<128>::max(); } - LIBC_INLINE static constexpr UInt<128> min() { return UInt<128>::min(); } + LIBC_INLINE static constexpr UInt<128> max() { + return UInt<128>({0xffff'ffff'ffff'ffff, 0xffff'ffff'ffff'ffff}); + } + LIBC_INLINE static constexpr UInt<128> min() { return UInt<128>(0); } // Meant to match std::numeric_limits interface. // NOLINTNEXTLINE(readability-identifier-naming) LIBC_INLINE_VAR static constexpr int digits = 128; @@ -992,8 +973,12 @@ public: template <> class cpp::numeric_limits> { public: - LIBC_INLINE static constexpr Int<128> max() { return Int<128>::max(); } - LIBC_INLINE static constexpr Int<128> min() { return Int<128>::min(); } + LIBC_INLINE static constexpr Int<128> max() { + return Int<128>({0xffff'ffff'ffff'ffff, 0x7fff'ffff'ffff'ffff}); + } + LIBC_INLINE static constexpr Int<128> min() { + return Int<128>({0, 0x8000'0000'0000'0000}); + } // Meant to match std::numeric_limits interface. 
// NOLINTNEXTLINE(readability-identifier-naming) LIBC_INLINE_VAR static constexpr int digits = 128; @@ -1127,28 +1112,30 @@ has_single_bit(T value) { template [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> countr_zero(const T &value) { - return multiword::countr_zero(value.val); + return value.ctz(); } // Specialization of cpp::countl_zero ('bit.h') for BigInt. template [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> countl_zero(const T &value) { - return multiword::countl_zero(value.val); + return value.clz(); } // Specialization of cpp::countl_one ('bit.h') for BigInt. template [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> countl_one(T value) { - return multiword::countl_one(value.val); + // TODO : Implement a faster version not involving operator~. + return cpp::countl_zero(~value); } // Specialization of cpp::countr_one ('bit.h') for BigInt. template [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> countr_one(T value) { - return multiword::countr_one(value.val); + // TODO : Implement a faster version not involving operator~. + return cpp::countr_zero(~value); } // Specialization of cpp::bit_width ('bit.h') for BigInt. @@ -1195,59 +1182,65 @@ rotr(T value, int rotate) { template LIBC_INLINE constexpr cpp::enable_if_t, T> mask_trailing_ones() { - static_assert(!T::SIGNED && count <= T::BITS); - if (count == T::BITS) - return T::all_ones(); - constexpr size_t QUOTIENT = count / T::WORD_SIZE; - constexpr size_t REMAINDER = count % T::WORD_SIZE; - T out; // zero initialized - for (size_t i = 0; i <= QUOTIENT; ++i) - out[i] = i < QUOTIENT - ? 
-1 - : mask_trailing_ones(); + static_assert(!T::SIGNED); + if (count == 0) + return T(); + constexpr unsigned T_BITS = CHAR_BIT * sizeof(T); + static_assert(count <= T_BITS && "Invalid bit index"); + using word_type = typename T::word_type; + T out; + constexpr int CHUNK_INDEX_CONTAINING_BIT = + static_cast(count / T::WORD_SIZE); + int index = 0; + for (auto &word : out.val) { + if (index < CHUNK_INDEX_CONTAINING_BIT) + word = -1; + else if (index > CHUNK_INDEX_CONTAINING_BIT) + word = 0; + else + word = mask_trailing_ones(); + ++index; + } return out; } // Specialization of mask_leading_ones ('math_extras.h') for BigInt. template LIBC_INLINE constexpr cpp::enable_if_t, T> mask_leading_ones() { - static_assert(!T::SIGNED && count <= T::BITS); - if (count == T::BITS) - return T::all_ones(); - constexpr size_t QUOTIENT = (T::BITS - count - 1U) / T::WORD_SIZE; - constexpr size_t REMAINDER = count % T::WORD_SIZE; - T out; // zero initialized - for (size_t i = QUOTIENT; i < T::WORD_COUNT; ++i) - out[i] = i > QUOTIENT - ? -1 - : mask_leading_ones(); + static_assert(!T::SIGNED); + if (count == 0) + return T(); + constexpr unsigned T_BITS = CHAR_BIT * sizeof(T); + static_assert(count <= T_BITS && "Invalid bit index"); + using word_type = typename T::word_type; + T out; + constexpr int CHUNK_INDEX_CONTAINING_BIT = + static_cast((T::BITS - count - 1ULL) / T::WORD_SIZE); + int index = 0; + for (auto &word : out.val) { + if (index < CHUNK_INDEX_CONTAINING_BIT) + word = 0; + else if (index > CHUNK_INDEX_CONTAINING_BIT) + word = -1; + else + word = mask_leading_ones(); + ++index; + } return out; } -// Specialization of mask_trailing_zeros ('math_extras.h') for BigInt. -template -LIBC_INLINE constexpr cpp::enable_if_t, T> -mask_trailing_zeros() { - return mask_leading_ones(); -} - -// Specialization of mask_leading_zeros ('math_extras.h') for BigInt. 
-template -LIBC_INLINE constexpr cpp::enable_if_t, T> -mask_leading_zeros() { - return mask_trailing_ones(); -} - // Specialization of count_zeros ('math_extras.h') for BigInt. template -[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> +[[nodiscard]] +LIBC_INLINE constexpr cpp::enable_if_t, int> count_zeros(T value) { return cpp::popcount(~value); } // Specialization of first_leading_zero ('math_extras.h') for BigInt. template -[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> +[[nodiscard]] +LIBC_INLINE constexpr cpp::enable_if_t, int> first_leading_zero(T value) { return value == cpp::numeric_limits::max() ? 0 : cpp::countl_one(value) + 1; @@ -1255,14 +1248,16 @@ first_leading_zero(T value) { // Specialization of first_leading_one ('math_extras.h') for BigInt. template -[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> +[[nodiscard]] +LIBC_INLINE constexpr cpp::enable_if_t, int> first_leading_one(T value) { return first_leading_zero(~value); } // Specialization of first_trailing_zero ('math_extras.h') for BigInt. template -[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> +[[nodiscard]] +LIBC_INLINE constexpr cpp::enable_if_t, int> first_trailing_zero(T value) { return value == cpp::numeric_limits::max() ? 0 : cpp::countr_zero(~value) + 1; @@ -1270,7 +1265,8 @@ first_trailing_zero(T value) { // Specialization of first_trailing_one ('math_extras.h') for BigInt. template -[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> +[[nodiscard]] +LIBC_INLINE constexpr cpp::enable_if_t, int> first_trailing_one(T value) { return value == cpp::numeric_limits::max() ? 
0 : cpp::countr_zero(value) + 1; diff --git a/libc/src/__support/float_to_string.h b/libc/src/__support/float_to_string.h index 4c59cfd99c2e..1287c3e9a84f 100644 --- a/libc/src/__support/float_to_string.h +++ b/libc/src/__support/float_to_string.h @@ -689,7 +689,7 @@ template <> class FloatToString { wide_int float_as_int = mantissa; - float_as_int <<= exponent; + float_as_int.shift_left(exponent); int_block_index = 0; while (float_as_int > 0) { @@ -708,11 +708,10 @@ template <> class FloatToString { const int SHIFT_AMOUNT = FLOAT_AS_INT_WIDTH + exponent; static_assert(EXTRA_INT_WIDTH >= sizeof(long double) * 8); - float_as_fixed <<= SHIFT_AMOUNT; + float_as_fixed.shift_left(SHIFT_AMOUNT); // If there are still digits above the decimal point, handle those. - if (cpp::countl_zero(float_as_fixed) < - static_cast(EXTRA_INT_WIDTH)) { + if (float_as_fixed.clz() < static_cast(EXTRA_INT_WIDTH)) { UInt above_decimal_point = float_as_fixed >> FLOAT_AS_INT_WIDTH; diff --git a/libc/src/__support/integer_literals.h b/libc/src/__support/integer_literals.h index e99799c3512e..de1f88fbd3f3 100644 --- a/libc/src/__support/integer_literals.h +++ b/libc/src/__support/integer_literals.h @@ -151,15 +151,12 @@ template struct Parser> { template LIBC_INLINE constexpr T parse_with_prefix(const char *ptr) { using P = Parser; - if (ptr == nullptr) - return T(); - if (ptr[0] == '0') { - if (ptr[1] == 'b') - return P::template parse<2>(ptr + 2); - if (ptr[1] == 'x') - return P::template parse<16>(ptr + 2); - } - return P::template parse<10>(ptr); + if (ptr[0] == '0' && ptr[1] == 'x') + return P::template parse<16>(ptr + 2); + else if (ptr[0] == '0' && ptr[1] == 'b') + return P::template parse<2>(ptr + 2); + else + return P::template parse<10>(ptr); } } // namespace internal @@ -172,16 +169,6 @@ LIBC_INLINE constexpr auto operator""_u256(const char *x) { return internal::parse_with_prefix>(x); } -template LIBC_INLINE constexpr T parse_bigint(const char *ptr) { - if (ptr == nullptr) - return 
T(); - if (ptr[0] == '-' || ptr[0] == '+') { - auto positive = internal::parse_with_prefix(ptr + 1); - return ptr[0] == '-' ? -positive : positive; - } - return internal::parse_with_prefix(ptr); -} - } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC___SUPPORT_INTEGER_LITERALS_H diff --git a/libc/src/__support/math_extras.h b/libc/src/__support/math_extras.h index bb6424bfa8e4..70a8800b285d 100644 --- a/libc/src/__support/math_extras.h +++ b/libc/src/__support/math_extras.h @@ -10,9 +10,9 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXTRAS_H #define LLVM_LIBC_SRC___SUPPORT_MATH_EXTRAS_H -#include "src/__support/CPP/bit.h" // countl_one, countr_zero -#include "src/__support/CPP/limits.h" // CHAR_BIT, numeric_limits -#include "src/__support/CPP/type_traits.h" // is_unsigned_v, is_constant_evaluated +#include "src/__support/CPP/bit.h" // countl_one, countr_zero +#include "src/__support/CPP/limits.h" // CHAR_BIT, numeric_limits +#include "src/__support/CPP/type_traits.h" // is_unsigned_v #include "src/__support/macros/attributes.h" // LIBC_INLINE namespace LIBC_NAMESPACE { @@ -32,94 +32,199 @@ mask_trailing_ones() { template LIBC_INLINE constexpr cpp::enable_if_t, T> mask_leading_ones() { - return T(~mask_trailing_ones()); + constexpr T MASK(mask_trailing_ones()); + return T(~MASK); // bitwise NOT performs integer promotion. } -// Create a bitmask with the count right-most bits set to 0, and all other bits -// set to 1. Only unsigned types are allowed. -template -LIBC_INLINE constexpr cpp::enable_if_t, T> -mask_trailing_zeros() { - return mask_leading_ones(); +// Add with carry +template struct SumCarry { + T sum; + T carry; +}; + +// This version is always valid for constexpr. 
+template +LIBC_INLINE constexpr cpp::enable_if_t< + cpp::is_integral_v && cpp::is_unsigned_v, SumCarry> +add_with_carry_const(T a, T b, T carry_in) { + T tmp = a + carry_in; + T sum = b + tmp; + T carry_out = (sum < b) + (tmp < a); + return {sum, carry_out}; } -// Create a bitmask with the count left-most bits set to 0, and all other bits -// set to 1. Only unsigned types are allowed. -template -LIBC_INLINE constexpr cpp::enable_if_t, T> -mask_leading_zeros() { - return mask_trailing_ones(); +template +LIBC_INLINE constexpr cpp::enable_if_t< + cpp::is_integral_v && cpp::is_unsigned_v, SumCarry> +add_with_carry(T a, T b, T carry_in) { + return add_with_carry_const(a, b, carry_in); +} + +#if __has_builtin(__builtin_addc) +// https://clang.llvm.org/docs/LanguageExtensions.html#multiprecision-arithmetic-builtins + +template <> +LIBC_INLINE constexpr SumCarry +add_with_carry(unsigned char a, unsigned char b, + unsigned char carry_in) { + if (__builtin_is_constant_evaluated()) { + return add_with_carry_const(a, b, carry_in); + } else { + SumCarry result{0, 0}; + result.sum = __builtin_addcb(a, b, carry_in, &result.carry); + return result; + } +} + +template <> +LIBC_INLINE constexpr SumCarry +add_with_carry(unsigned short a, unsigned short b, + unsigned short carry_in) { + if (__builtin_is_constant_evaluated()) { + return add_with_carry_const(a, b, carry_in); + } else { + SumCarry result{0, 0}; + result.sum = __builtin_addcs(a, b, carry_in, &result.carry); + return result; + } +} + +template <> +LIBC_INLINE constexpr SumCarry +add_with_carry(unsigned int a, unsigned int b, + unsigned int carry_in) { + if (__builtin_is_constant_evaluated()) { + return add_with_carry_const(a, b, carry_in); + } else { + SumCarry result{0, 0}; + result.sum = __builtin_addc(a, b, carry_in, &result.carry); + return result; + } +} + +template <> +LIBC_INLINE constexpr SumCarry +add_with_carry(unsigned long a, unsigned long b, + unsigned long carry_in) { + if (__builtin_is_constant_evaluated()) 
{ + return add_with_carry_const(a, b, carry_in); + } else { + SumCarry result{0, 0}; + result.sum = __builtin_addcl(a, b, carry_in, &result.carry); + return result; + } +} + +template <> +LIBC_INLINE constexpr SumCarry +add_with_carry(unsigned long long a, unsigned long long b, + unsigned long long carry_in) { + if (__builtin_is_constant_evaluated()) { + return add_with_carry_const(a, b, carry_in); + } else { + SumCarry result{0, 0}; + result.sum = __builtin_addcll(a, b, carry_in, &result.carry); + return result; + } } -// Returns whether 'a + b' overflows, the result is stored in 'res'. +#endif // __has_builtin(__builtin_addc) + +// Subtract with borrow +template struct DiffBorrow { + T diff; + T borrow; +}; + +// This version is always valid for constexpr. template -[[nodiscard]] LIBC_INLINE constexpr bool add_overflow(T a, T b, T &res) { - return __builtin_add_overflow(a, b, &res); +LIBC_INLINE constexpr cpp::enable_if_t< + cpp::is_integral_v && cpp::is_unsigned_v, DiffBorrow> +sub_with_borrow_const(T a, T b, T borrow_in) { + T tmp = a - b; + T diff = tmp - borrow_in; + T borrow_out = (diff > tmp) + (tmp > a); + return {diff, borrow_out}; } -// Returns whether 'a - b' overflows, the result is stored in 'res'. +// This version is not always valid for constepxr because it's overriden below +// if builtins are available. template -[[nodiscard]] LIBC_INLINE constexpr bool sub_overflow(T a, T b, T &res) { - return __builtin_sub_overflow(a, b, &res); +LIBC_INLINE constexpr cpp::enable_if_t< + cpp::is_integral_v && cpp::is_unsigned_v, DiffBorrow> +sub_with_borrow(T a, T b, T borrow_in) { + return sub_with_borrow_const(a, b, borrow_in); } -#define RETURN_IF(TYPE, BUILTIN) \ - if constexpr (cpp::is_same_v) \ - return BUILTIN(a, b, carry_in, carry_out); +#if __has_builtin(__builtin_subc) +// https://clang.llvm.org/docs/LanguageExtensions.html#multiprecision-arithmetic-builtins -// Returns the result of 'a + b' taking into account 'carry_in'. 
-// The carry out is stored in 'carry_out' it not 'nullptr', dropped otherwise. -// We keep the pass by pointer interface for consistency with the intrinsic. -template -[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, T> -add_with_carry(T a, T b, T carry_in, T &carry_out) { - if constexpr (!cpp::is_constant_evaluated()) { -#if __has_builtin(__builtin_addcb) - RETURN_IF(unsigned char, __builtin_addcb) -#elif __has_builtin(__builtin_addcs) - RETURN_IF(unsigned short, __builtin_addcs) -#elif __has_builtin(__builtin_addc) - RETURN_IF(unsigned int, __builtin_addc) -#elif __has_builtin(__builtin_addcl) - RETURN_IF(unsigned long, __builtin_addcl) -#elif __has_builtin(__builtin_addcll) - RETURN_IF(unsigned long long, __builtin_addcll) -#endif +template <> +LIBC_INLINE constexpr DiffBorrow +sub_with_borrow(unsigned char a, unsigned char b, + unsigned char borrow_in) { + if (__builtin_is_constant_evaluated()) { + return sub_with_borrow_const(a, b, borrow_in); + } else { + DiffBorrow result{0, 0}; + result.diff = __builtin_subcb(a, b, borrow_in, &result.borrow); + return result; } - T sum; - T carry1 = add_overflow(a, b, sum); - T carry2 = add_overflow(sum, carry_in, sum); - carry_out = carry1 | carry2; - return sum; } -// Returns the result of 'a - b' taking into account 'carry_in'. -// The carry out is stored in 'carry_out' it not 'nullptr', dropped otherwise. -// We keep the pass by pointer interface for consistency with the intrinsic. 
-template -[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, T> -sub_with_borrow(T a, T b, T carry_in, T &carry_out) { - if constexpr (!cpp::is_constant_evaluated()) { -#if __has_builtin(__builtin_subcb) - RETURN_IF(unsigned char, __builtin_subcb) -#elif __has_builtin(__builtin_subcs) - RETURN_IF(unsigned short, __builtin_subcs) -#elif __has_builtin(__builtin_subc) - RETURN_IF(unsigned int, __builtin_subc) -#elif __has_builtin(__builtin_subcl) - RETURN_IF(unsigned long, __builtin_subcl) -#elif __has_builtin(__builtin_subcll) - RETURN_IF(unsigned long long, __builtin_subcll) -#endif +template <> +LIBC_INLINE constexpr DiffBorrow +sub_with_borrow(unsigned short a, unsigned short b, + unsigned short borrow_in) { + if (__builtin_is_constant_evaluated()) { + return sub_with_borrow_const(a, b, borrow_in); + } else { + DiffBorrow result{0, 0}; + result.diff = __builtin_subcs(a, b, borrow_in, &result.borrow); + return result; + } +} + +template <> +LIBC_INLINE constexpr DiffBorrow +sub_with_borrow(unsigned int a, unsigned int b, + unsigned int borrow_in) { + if (__builtin_is_constant_evaluated()) { + return sub_with_borrow_const(a, b, borrow_in); + } else { + DiffBorrow result{0, 0}; + result.diff = __builtin_subc(a, b, borrow_in, &result.borrow); + return result; + } +} + +template <> +LIBC_INLINE constexpr DiffBorrow +sub_with_borrow(unsigned long a, unsigned long b, + unsigned long borrow_in) { + if (__builtin_is_constant_evaluated()) { + return sub_with_borrow_const(a, b, borrow_in); + } else { + DiffBorrow result{0, 0}; + result.diff = __builtin_subcl(a, b, borrow_in, &result.borrow); + return result; + } +} + +template <> +LIBC_INLINE constexpr DiffBorrow +sub_with_borrow(unsigned long long a, unsigned long long b, + unsigned long long borrow_in) { + if (__builtin_is_constant_evaluated()) { + return sub_with_borrow_const(a, b, borrow_in); + } else { + DiffBorrow result{0, 0}; + result.diff = __builtin_subcll(a, b, borrow_in, &result.borrow); + return result; } - 
T sub; - T carry1 = sub_overflow(a, b, sub); - T carry2 = sub_overflow(sub, carry_in, sub); - carry_out = carry1 | carry2; - return sub; } -#undef RETURN_IF +#endif // __has_builtin(__builtin_subc) template [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> diff --git a/libc/src/__support/number_pair.h b/libc/src/__support/number_pair.h index 2f713fc03520..ee6667b1299f 100644 --- a/libc/src/__support/number_pair.h +++ b/libc/src/__support/number_pair.h @@ -20,6 +20,17 @@ template struct NumberPair { T hi = T(0); }; +template +cpp::enable_if_t && cpp::is_unsigned_v, + NumberPair> constexpr split(T a) { + constexpr size_t HALF_BIT_WIDTH = sizeof(T) * 4; + constexpr T LOWER_HALF_MASK = (T(1) << HALF_BIT_WIDTH) - T(1); + NumberPair result; + result.lo = a & LOWER_HALF_MASK; + result.hi = a >> HALF_BIT_WIDTH; + return result; +} + } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC___SUPPORT_NUMBER_PAIR_H diff --git a/libc/test/src/__support/integer_literals_test.cpp b/libc/test/src/__support/integer_literals_test.cpp index cbc906aa7c99..5298cf30156e 100644 --- a/libc/test/src/__support/integer_literals_test.cpp +++ b/libc/test/src/__support/integer_literals_test.cpp @@ -133,24 +133,3 @@ TEST(LlvmLibcIntegerLiteralTest, u256) { U256_MAX, 0xFFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF_u256); } - -TEST(LlvmLibcIntegerLiteralTest, parse_bigint) { - using T = LIBC_NAMESPACE::Int<128>; - struct { - const char *str; - T expected; - } constexpr TEST_CASES[] = { - {"0", 0}, {"-1", -1}, {"+1", 1}, {"-0xFF", -255}, {"-0b11", -3}, - }; - for (auto tc : TEST_CASES) { - T actual = LIBC_NAMESPACE::parse_bigint(tc.str); - EXPECT_EQ(actual, tc.expected); - } -} - -TEST(LlvmLibcIntegerLiteralTest, parse_bigint_invalid) { - using T = LIBC_NAMESPACE::Int<128>; - const T expected; // default construction - EXPECT_EQ(LIBC_NAMESPACE::parse_bigint(nullptr), expected); - EXPECT_EQ(LIBC_NAMESPACE::parse_bigint(""), expected); -} diff --git 
a/libc/test/src/__support/math_extras_test.cpp b/libc/test/src/__support/math_extras_test.cpp index 401e631ea4ba..e88b3e1d6b68 100644 --- a/libc/test/src/__support/math_extras_test.cpp +++ b/libc/test/src/__support/math_extras_test.cpp @@ -101,61 +101,4 @@ TYPED_TEST(LlvmLibcBitTest, CountZeros, UnsignedTypesNoBigInt) { EXPECT_EQ(count_zeros(cpp::numeric_limits::max() >> i), i); } -using UnsignedTypes = testing::TypeList< -#if defined(__SIZEOF_INT128__) - __uint128_t, -#endif - unsigned char, unsigned short, unsigned int, unsigned long, - unsigned long long>; - -TYPED_TEST(LlvmLibcBlockMathExtrasTest, add_overflow, UnsignedTypes) { - constexpr T ZERO = cpp::numeric_limits::min(); - constexpr T ONE(1); - constexpr T MAX = cpp::numeric_limits::max(); - constexpr T BEFORE_MAX = MAX - 1; - - const struct { - T lhs; - T rhs; - T sum; - bool carry; - } TESTS[] = { - {ZERO, ONE, ONE, false}, // 0x00 + 0x01 = 0x01 - {BEFORE_MAX, ONE, MAX, false}, // 0xFE + 0x01 = 0xFF - {MAX, ONE, ZERO, true}, // 0xFF + 0x01 = 0x00 (carry) - {MAX, MAX, BEFORE_MAX, true}, // 0xFF + 0xFF = 0xFE (carry) - }; - for (auto tc : TESTS) { - T sum; - bool carry = add_overflow(tc.lhs, tc.rhs, sum); - EXPECT_EQ(sum, tc.sum); - EXPECT_EQ(carry, tc.carry); - } -} - -TYPED_TEST(LlvmLibcBlockMathExtrasTest, sub_overflow, UnsignedTypes) { - constexpr T ZERO = cpp::numeric_limits::min(); - constexpr T ONE(1); - constexpr T MAX = cpp::numeric_limits::max(); - constexpr T BEFORE_MAX = MAX - 1; - - const struct { - T lhs; - T rhs; - T sub; - bool carry; - } TESTS[] = { - {ONE, ZERO, ONE, false}, // 0x01 - 0x00 = 0x01 - {MAX, MAX, ZERO, false}, // 0xFF - 0xFF = 0x00 - {ZERO, ONE, MAX, true}, // 0x00 - 0x01 = 0xFF (carry) - {BEFORE_MAX, MAX, MAX, true}, // 0xFE - 0xFF = 0xFF (carry) - }; - for (auto tc : TESTS) { - T sub; - bool carry = sub_overflow(tc.lhs, tc.rhs, sub); - EXPECT_EQ(sub, tc.sub); - EXPECT_EQ(carry, tc.carry); - } -} - } // namespace LIBC_NAMESPACE diff --git 
a/libc/test/src/__support/uint_test.cpp b/libc/test/src/__support/uint_test.cpp index 5696e54c73f3..5764324ca288 100644 --- a/libc/test/src/__support/uint_test.cpp +++ b/libc/test/src/__support/uint_test.cpp @@ -8,7 +8,6 @@ #include "src/__support/CPP/optional.h" #include "src/__support/UInt.h" -#include "src/__support/integer_literals.h" // parse_unsigned_bigint #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128 #include "include/llvm-libc-macros/math-macros.h" // HUGE_VALF, HUGE_VALF @@ -16,195 +15,6 @@ namespace LIBC_NAMESPACE { -enum Value { ZERO, ONE, TWO, MIN, MAX }; - -template auto create(Value value) { - switch (value) { - case ZERO: - return T(0); - case ONE: - return T(1); - case TWO: - return T(2); - case MIN: - return T::min(); - case MAX: - return T::max(); - } -} - -using Types = testing::TypeList< // -#ifdef LIBC_TYPES_HAS_INT64 - BigInt<64, false, uint64_t>, // 64-bits unsigned (1 x uint64_t) - BigInt<64, true, uint64_t>, // 64-bits signed (1 x uint64_t) -#endif -#ifdef LIBC_TYPES_HAS_INT128 - BigInt<128, false, __uint128_t>, // 128-bits unsigned (1 x __uint128_t) - BigInt<128, true, __uint128_t>, // 128-bits signed (1 x __uint128_t) -#endif - BigInt<16, false, uint16_t>, // 16-bits unsigned (1 x uint16_t) - BigInt<16, true, uint16_t>, // 16-bits signed (1 x uint16_t) - BigInt<64, false, uint16_t>, // 64-bits unsigned (4 x uint16_t) - BigInt<64, true, uint16_t> // 64-bits signed (4 x uint16_t) - >; - -#define ASSERT_SAME(A, B) ASSERT_TRUE((A) == (B)) - -TYPED_TEST(LlvmLibcUIntClassTest, Additions, Types) { - ASSERT_SAME(create(ZERO) + create(ZERO), create(ZERO)); - ASSERT_SAME(create(ONE) + create(ZERO), create(ONE)); - ASSERT_SAME(create(ZERO) + create(ONE), create(ONE)); - ASSERT_SAME(create(ONE) + create(ONE), create(TWO)); - // 2's complement addition works for signed and unsigned types. 
- // - unsigned : 0xff + 0x01 = 0x00 (255 + 1 = 0) - // - signed : 0xef + 0x01 = 0xf0 (127 + 1 = -128) - ASSERT_SAME(create(MAX) + create(ONE), create(MIN)); -} - -TYPED_TEST(LlvmLibcUIntClassTest, Subtraction, Types) { - ASSERT_SAME(create(ZERO) - create(ZERO), create(ZERO)); - ASSERT_SAME(create(ONE) - create(ONE), create(ZERO)); - ASSERT_SAME(create(ONE) - create(ZERO), create(ONE)); - // 2's complement subtraction works for signed and unsigned types. - // - unsigned : 0x00 - 0x01 = 0xff ( 0 - 1 = 255) - // - signed : 0xf0 - 0x01 = 0xef (-128 - 1 = 127) - ASSERT_SAME(create(MIN) - create(ONE), create(MAX)); -} - -TYPED_TEST(LlvmLibcUIntClassTest, Multiplication, Types) { - ASSERT_SAME(create(ZERO) * create(ZERO), create(ZERO)); - ASSERT_SAME(create(ZERO) * create(ONE), create(ZERO)); - ASSERT_SAME(create(ONE) * create(ZERO), create(ZERO)); - ASSERT_SAME(create(ONE) * create(ONE), create(ONE)); - ASSERT_SAME(create(ONE) * create(TWO), create(TWO)); - ASSERT_SAME(create(TWO) * create(ONE), create(TWO)); - // - unsigned : 0xff x 0xff = 0x01 (mod 0xff) - // - signed : 0xef x 0xef = 0x01 (mod 0xff) - ASSERT_SAME(create(MAX) * create(MAX), create(ONE)); -} - -template void print(const char *msg, T value) { - testing::tlog << msg; - IntegerToString buffer(value); - testing::tlog << buffer.view() << "\n"; -} - -TEST(LlvmLibcUIntClassTest, SignedAddSub) { - // Computations performed by https://www.wolframalpha.com/ - using T = BigInt<128, true, uint32_t>; - const T a = parse_bigint("1927508279017230597"); - const T b = parse_bigint("278789278723478925"); - const T s = parse_bigint("2206297557740709522"); - // Addition - ASSERT_SAME(a + b, s); - ASSERT_SAME(b + a, s); // commutative - // Subtraction - ASSERT_SAME(a - s, -b); - ASSERT_SAME(s - a, b); -} - -TEST(LlvmLibcUIntClassTest, SignedMulDiv) { - // Computations performed by https://www.wolframalpha.com/ - using T = BigInt<128, true, uint16_t>; - struct { - const char *a; - const char *b; - const char *mul; - } const 
test_cases[] = {{"-4", "3", "-12"}, - {"-3", "-3", "9"}, - {"1927508279017230597", "278789278723478925", - "537368642840747885329125014794668225"}}; - for (auto tc : test_cases) { - const T a = parse_bigint(tc.a); - const T b = parse_bigint(tc.b); - const T mul = parse_bigint(tc.mul); - // Multiplication - ASSERT_SAME(a * b, mul); - ASSERT_SAME(b * a, mul); // commutative - ASSERT_SAME(a * -b, -mul); // sign - ASSERT_SAME(-a * b, -mul); // sign - ASSERT_SAME(-a * -b, mul); // sign - // Division - ASSERT_SAME(mul / a, b); - ASSERT_SAME(mul / b, a); - ASSERT_SAME(-mul / a, -b); // sign - ASSERT_SAME(mul / -a, -b); // sign - ASSERT_SAME(-mul / -a, b); // sign - } -} - -TYPED_TEST(LlvmLibcUIntClassTest, Division, Types) { - ASSERT_SAME(create(ZERO) / create(ONE), create(ZERO)); - ASSERT_SAME(create(MAX) / create(ONE), create(MAX)); - ASSERT_SAME(create(MAX) / create(MAX), create(ONE)); - ASSERT_SAME(create(ONE) / create(ONE), create(ONE)); - if constexpr (T::SIGNED) { - // Special case found by fuzzing. 
- ASSERT_SAME(create(MIN) / create(MIN), create(ONE)); - } - // - unsigned : 0xff / 0x02 = 0x7f - // - signed : 0xef / 0x02 = 0x77 - ASSERT_SAME(create(MAX) / create(TWO), (create(MAX) >> 1)); - - using word_type = typename T::word_type; - const T zero_one_repeated = T::all_ones() / T(0xff); - const word_type pattern = word_type(~0) / word_type(0xff); - for (const word_type part : zero_one_repeated.val) { - if constexpr (T::SIGNED == false) { - EXPECT_EQ(part, pattern); - } - } -} - -TYPED_TEST(LlvmLibcUIntClassTest, is_neg, Types) { - EXPECT_FALSE(create(ZERO).is_neg()); - EXPECT_FALSE(create(ONE).is_neg()); - EXPECT_FALSE(create(TWO).is_neg()); - EXPECT_EQ(create(MIN).is_neg(), T::SIGNED); - EXPECT_FALSE(create(MAX).is_neg()); -} - -TYPED_TEST(LlvmLibcUIntClassTest, Masks, Types) { - if constexpr (!T::SIGNED) { - constexpr size_t BITS = T::BITS; - // mask_trailing_ones - ASSERT_SAME((mask_trailing_ones()), T::zero()); - ASSERT_SAME((mask_trailing_ones()), T::one()); - ASSERT_SAME((mask_trailing_ones()), T::all_ones() >> 1); - ASSERT_SAME((mask_trailing_ones()), T::all_ones()); - // mask_leading_ones - ASSERT_SAME((mask_leading_ones()), T::zero()); - ASSERT_SAME((mask_leading_ones()), T::one() << (BITS - 1)); - ASSERT_SAME((mask_leading_ones()), T::all_ones() - T::one()); - ASSERT_SAME((mask_leading_ones()), T::all_ones()); - // mask_trailing_zeros - ASSERT_SAME((mask_trailing_zeros()), T::all_ones()); - ASSERT_SAME((mask_trailing_zeros()), T::all_ones() - T::one()); - ASSERT_SAME((mask_trailing_zeros()), T::one() << (BITS - 1)); - ASSERT_SAME((mask_trailing_zeros()), T::zero()); - // mask_trailing_zeros - ASSERT_SAME((mask_leading_zeros()), T::all_ones()); - ASSERT_SAME((mask_leading_zeros()), T::all_ones() >> 1); - ASSERT_SAME((mask_leading_zeros()), T::one()); - ASSERT_SAME((mask_leading_zeros()), T::zero()); - } -} - -TYPED_TEST(LlvmLibcUIntClassTest, CountBits, Types) { - if constexpr (!T::SIGNED) { - for (size_t i = 0; i <= T::BITS; ++i) { - const auto l_one 
= T::all_ones() << i; // 0b111...000 - const auto r_one = T::all_ones() >> i; // 0b000...111 - const int zeros = i; - const int ones = T::BITS - zeros; - ASSERT_EQ(cpp::countr_one(r_one), ones); - ASSERT_EQ(cpp::countl_one(l_one), ones); - ASSERT_EQ(cpp::countr_zero(l_one), zeros); - ASSERT_EQ(cpp::countl_zero(r_one), zeros); - } - } -} - using LL_UInt64 = UInt<64>; // We want to test UInt<128> explicitly. So, for // convenience, we use a sugar which does not conflict with the UInt128 type @@ -751,7 +561,7 @@ TEST(LlvmLibcUIntClassTest, FullMulTests) { LL_UInt##Bits a = ~LL_UInt##Bits(0); \ LL_UInt##Bits hi = a.quick_mul_hi(a); \ LL_UInt##Bits trunc = static_cast(a.ful_mul(a) >> Bits); \ - uint64_t overflow = trunc.sub_overflow(hi); \ + uint64_t overflow = trunc.sub(hi); \ EXPECT_EQ(overflow, uint64_t(0)); \ EXPECT_LE(uint64_t(trunc), uint64_t(Error)); \ } while (0) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel index c0d402a89ea3..4f976122967c 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel @@ -87,7 +87,6 @@ libc_test( srcs = ["uint_test.cpp"], deps = [ "//libc:__support_cpp_optional", - "//libc:__support_integer_literals", "//libc:__support_macros_properties_types", "//libc:__support_uint", "//libc:llvm_libc_macros_math_macros", -- cgit v1.2.3 From 61efea7142e904e6492e1ce0566ec23d9d221c1e Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Thu, 4 Apr 2024 10:12:33 +0100 Subject: [libclc] Fix a couple of issues preventing in-tree builds (#87505) libclc is mentioned in the list of LLVM_ENABLE_PROJECTS but it isn't actually possible to build it in-tree for various reasons. Users currently have to build it via LLVM_ENABLE_EXTERNAL_PROJECTS, which isn't very well documented. 
We can't properly build in-tree because the current system needs to "see" clang and other tools at CMake configuration time. The general idea is that we could fix this in the future by moving the compilation and linking of bitcode libraries to custom commands, which would remove the dependency on CMake configuration and would allow us to build libclc after clang and other tools are built in-tree. Since that's a bigger change, it is being left for later. Note that with this commit it's *still* not possible to properly build in-tree - this commit just fixes a few little things that are in the way. We are now able to build in-tree in the sense that it can be built as a regular LLVM sub-project, but the tools it uses to compile the libraries are still picked up from a pre-existing installation of LLVM, and not from tools built during the same build as libclc. The things fixed by this commit include: * Its use of CMAKE_SOURCE_DIR (i.e., assuming it was the top-level project) * These have been converted to PROJECT_SOURCE_DIR - should have no consequences for out-of-tree builds. * Its prepare_builtins tool insisting on linking against the dynamic LLVM.so. * This has been turned from an "llvm executable" into an "llvm utility" which links against the static libraries. * It was also missing a link component for the IRReader library. * Assuming an output path for its builtin libraries (dependent on the working directory) * This has been changed to query CMake for the library target's output file. * The spirv-mesa3d and spirv64-mesa3d targets were enabled by default (or when asking to build 'all' libclc targets), when they require llvm-spirv as an external dependency. * They are now only built when the user explicitly asks for them, or when llvm-spirv is available and the user asks for 'all'. 
--- libclc/CMakeLists.txt | 102 +++++++++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 46 deletions(-) diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 9236f09d3667..2d000cf5cad8 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -20,21 +20,6 @@ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS spirv64/lib/SOURCES ) -# List of all targets -set( LIBCLC_TARGETS_ALL - amdgcn-- - amdgcn--amdhsa - clspv-- - clspv64-- - r600-- - nvptx-- - nvptx64-- - nvptx--nvidiacl - nvptx64--nvidiacl - spirv-mesa3d- - spirv64-mesa3d- -) - set( LIBCLC_MIN_LLVM "3.9.0" ) set( LIBCLC_TARGETS_TO_BUILD "all" @@ -51,15 +36,6 @@ if( ${LLVM_PACKAGE_VERSION} VERSION_LESS ${LIBCLC_MIN_LLVM} ) message( FATAL_ERROR "libclc needs at least LLVM ${LIBCLC_MIN_LLVM}" ) endif() -# mesa3d environment is only available since LLVM 4.0 -if( ${LLVM_PACKAGE_VERSION} VERSION_GREATER "3.9.0" ) - set( LIBCLC_TARGETS_ALL ${LIBCLC_TARGETS_ALL} amdgcn-mesa-mesa3d ) -endif() - -if( LIBCLC_TARGETS_TO_BUILD STREQUAL "all" ) - set( LIBCLC_TARGETS_TO_BUILD ${LIBCLC_TARGETS_ALL} ) -endif() - find_program( LLVM_CLANG clang PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH ) find_program( LLVM_AS llvm-as PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH ) find_program( LLVM_LINK llvm-link PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH ) @@ -76,15 +52,45 @@ if( NOT LLVM_CLANG OR NOT LLVM_OPT OR NOT LLVM_AS OR NOT LLVM_LINK ) message( FATAL_ERROR "libclc toolchain incomplete!" ) endif() +# List of all targets. Note that some are added dynamically below. 
+set( LIBCLC_TARGETS_ALL + amdgcn-- + amdgcn--amdhsa + clspv-- + clspv64-- + r600-- + nvptx-- + nvptx64-- + nvptx--nvidiacl + nvptx64--nvidiacl +) + +# mesa3d environment is only available since LLVM 4.0 +if( ${LLVM_PACKAGE_VERSION} VERSION_GREATER "3.9.0" ) + list( APPEND LIBCLC_TARGETS_ALL amdgcn-mesa-mesa3d ) +endif() + +# spirv-mesa3d and spirv64-mesa3d targets can only be built with the (optional) +# llvm-spirv external tool. +if( LLVM_SPIRV ) + list( APPEND LIBCLC_TARGETS_ALL spirv-mesa3d- spirv64-mesa3d- ) +endif() + +if( LIBCLC_TARGETS_TO_BUILD STREQUAL "all" ) + set( LIBCLC_TARGETS_TO_BUILD ${LIBCLC_TARGETS_ALL} ) +endif() + list( SORT LIBCLC_TARGETS_TO_BUILD ) +# Verify that the user hasn't requested mesa3d targets without an available +# llvm-spirv tool. if( "spirv-mesa3d-" IN_LIST LIBCLC_TARGETS_TO_BUILD OR "spirv64-mesa3d-" IN_LIST LIBCLC_TARGETS_TO_BUILD ) if( NOT LLVM_SPIRV ) message( FATAL_ERROR "SPIR-V targets requested, but spirv-tools is not installed" ) endif() endif() -set( CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake ) +set( CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ) set( CMAKE_CLC_COMPILER ${LLVM_CLANG} ) set( CMAKE_CLC_ARCHIVE ${LLVM_LINK} ) set( CMAKE_LLAsm_PREPROCESSOR ${LLVM_CLANG} ) @@ -113,9 +119,10 @@ set(LLVM_LINK_COMPONENTS BitReader BitWriter Core + IRReader Support ) -add_llvm_executable( prepare_builtins utils/prepare-builtins.cpp ) +add_llvm_utility( prepare_builtins utils/prepare-builtins.cpp ) target_compile_definitions( prepare_builtins PRIVATE ${LLVM_VERSION_DEFINE} ) # These were not properly reported in early LLVM and we don't need them target_compile_options( prepare_builtins PRIVATE -fno-rtti -fno-exceptions ) @@ -165,7 +172,7 @@ if( ENABLE_RUNTIME_SUBNORMAL ) endif() find_package( Python3 REQUIRED COMPONENTS Interpreter ) -file( TO_CMAKE_PATH ${CMAKE_SOURCE_DIR}/generic/lib/gen_convert.py script_loc ) +file( TO_CMAKE_PATH ${PROJECT_SOURCE_DIR}/generic/lib/gen_convert.py script_loc ) add_custom_command( OUTPUT 
convert.cl COMMAND ${Python3_EXECUTABLE} ${script_loc} > convert.cl @@ -210,7 +217,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) foreach( l ${dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} ) foreach( s "SOURCES" "SOURCES_${LLVM_MAJOR}.${LLVM_MINOR}" ) file( TO_CMAKE_PATH ${l}/lib/${s} file_loc ) - file( TO_CMAKE_PATH ${CMAKE_SOURCE_DIR}/${file_loc} loc ) + file( TO_CMAKE_PATH ${PROJECT_SOURCE_DIR}/${file_loc} loc ) # Prepend the location to give higher priority to # specialized implementation if( EXISTS ${loc} ) @@ -246,7 +253,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) list( APPEND objects ${f} ) list( APPEND rel_files ${dir}/${f} ) # FIXME: This should really go away - file( TO_CMAKE_PATH ${CMAKE_SOURCE_DIR}/${dir}/${f} src_loc ) + file( TO_CMAKE_PATH ${PROJECT_SOURCE_DIR}/${dir}/${f} src_loc ) get_filename_component( fdir ${src_loc} DIRECTORY ) set_source_files_properties( ${dir}/${f} @@ -288,53 +295,56 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set( opt_flags -O3 ) endif() - add_library( builtins.link.${arch_suffix} STATIC ${rel_files} ) + set( builtins_link_lib_tgt builtins.link.${arch_suffix} ) + add_library( ${builtins_link_lib_tgt} STATIC ${rel_files} ) # Make sure we depend on the pseudo target to prevent # multiple invocations - add_dependencies( builtins.link.${arch_suffix} generate_convert.cl ) - add_dependencies( builtins.link.${arch_suffix} clspv-generate_convert.cl ) + add_dependencies( ${builtins_link_lib_tgt} generate_convert.cl ) + add_dependencies( ${builtins_link_lib_tgt} clspv-generate_convert.cl ) # CMake will turn this include into absolute path - target_include_directories( builtins.link.${arch_suffix} PRIVATE + target_include_directories( ${builtins_link_lib_tgt} PRIVATE "generic/include" ) - target_compile_definitions( builtins.link.${arch_suffix} PRIVATE + target_compile_definitions( ${builtins_link_lib_tgt} PRIVATE "__CLC_INTERNAL" ) string( TOUPPER "-DCLC_${ARCH}" CLC_TARGET_DEFINE ) - target_compile_definitions( 
builtins.link.${arch_suffix} PRIVATE + target_compile_definitions( ${builtins_link_lib_tgt} PRIVATE ${CLC_TARGET_DEFINE} ) - target_compile_options( builtins.link.${arch_suffix} PRIVATE -target + target_compile_options( ${builtins_link_lib_tgt} PRIVATE -target ${t} ${mcpu} -fno-builtin -nostdlib ${build_flags} ) - set_target_properties( builtins.link.${arch_suffix} PROPERTIES + set_target_properties( ${builtins_link_lib_tgt} PROPERTIES LINKER_LANGUAGE CLC ) set( obj_suffix ${arch_suffix}.bc ) + set( builtins_opt_lib_tgt builtins.opt.${obj_suffix} ) # Add opt target - add_custom_command( OUTPUT "builtins.opt.${obj_suffix}" - COMMAND ${LLVM_OPT} ${opt_flags} -o "builtins.opt.${obj_suffix}" "builtins.link.${obj_suffix}" - DEPENDS "builtins.link.${arch_suffix}" ) + add_custom_command( OUTPUT ${builtins_opt_lib_tgt} + COMMAND ${LLVM_OPT} ${opt_flags} -o ${builtins_opt_lib_tgt} + $<TARGET_FILE:${builtins_link_lib_tgt}> + DEPENDS ${builtins_link_lib_tgt} ) add_custom_target( "opt.${obj_suffix}" ALL - DEPENDS "builtins.opt.${obj_suffix}" ) + DEPENDS ${builtins_opt_lib_tgt} ) if( ${ARCH} STREQUAL "spirv" OR ${ARCH} STREQUAL "spirv64" ) set( spv_suffix ${arch_suffix}.spv ) add_custom_command( OUTPUT "${spv_suffix}" - COMMAND ${LLVM_SPIRV} ${spvflags} -o "${spv_suffix}" "builtins.link.${obj_suffix}" - DEPENDS "builtins.link.${arch_suffix}" ) + COMMAND ${LLVM_SPIRV} ${spvflags} -o "${spv_suffix}" ${builtins_opt_lib_tgt} + DEPENDS ${builtins_link_lib_tgt} ) add_custom_target( "prepare-${spv_suffix}" ALL DEPENDS "${spv_suffix}" ) install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${spv_suffix} DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" ) else() # Add prepare target add_custom_command( OUTPUT "${obj_suffix}" - COMMAND prepare_builtins -o "${obj_suffix}" "builtins.opt.${obj_suffix}" - DEPENDS "opt.${obj_suffix}" "builtins.opt.${obj_suffix}" prepare_builtins ) + COMMAND prepare_builtins -o "${obj_suffix}" ${builtins_opt_lib_tgt} + DEPENDS "opt.${obj_suffix}" ${builtins_opt_lib_tgt} prepare_builtins ) add_custom_target( 
"prepare-${obj_suffix}" ALL DEPENDS "${obj_suffix}" ) # nvptx-- targets don't include workitem builtins if( NOT ${t} MATCHES ".*ptx.*--$" ) add_test( NAME external-calls-${obj_suffix} COMMAND ./check_external_calls.sh ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} ${LLVM_TOOLS_BINARY_DIR} - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} ) + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} ) endif() install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" ) -- cgit v1.2.3 From 3cf539fb046457a444e93cefc87cca10cbd3b807 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 4 Apr 2024 10:14:16 +0100 Subject: [AMDGPU] Combine or remove redundant waitcnts at the end of each MBB (#87539) Call generateWaitcnt unconditionally at the end of SIInsertWaitcnts::insertWaitcntInBlock. Even if we don't need to generate a new waitcnt instruction it has the effect of combining or removing redundant waitcnts that were already present. Tests show various small improvements in waitcnt placement. 
--- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 45 ++++------- .../AMDGPU/GlobalISel/divergent-control-flow.ll | 4 - .../AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll | 2 - .../AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll | 2 - .../AMDGPU/atomic_optimizations_local_pointer.ll | 86 ++++++++++------------ llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 2 +- .../CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll | 2 +- .../test/CodeGen/AMDGPU/extract-subvector-16bit.ll | 11 --- llvm/test/CodeGen/AMDGPU/function-args.ll | 1 - .../CodeGen/AMDGPU/lds-global-non-entry-func.ll | 28 +++---- llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll | 8 +- llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 4 - .../transform-block-with-return-to-epilog.ll | 4 +- 13 files changed, 73 insertions(+), 126 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 27621906e4c5..bb499c5c8c57 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -708,9 +708,6 @@ public: WaitcntBrackets &ScoreBrackets, MachineInstr *OldWaitcntInstr, bool FlushVmCnt); - bool generateWaitcntBlockEnd(MachineBasicBlock &Block, - WaitcntBrackets &ScoreBrackets, - MachineInstr *OldWaitcntInstr); bool generateWaitcnt(AMDGPU::Waitcnt Wait, MachineBasicBlock::instr_iterator It, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets, @@ -1902,31 +1899,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, OldWaitcntInstr); } -// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the -// end of the given block if needed. 
-bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block, - WaitcntBrackets &ScoreBrackets, - MachineInstr *OldWaitcntInstr) { - AMDGPU::Waitcnt Wait; - - unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT); - unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT); - unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT); - - if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0) - return false; - - if (LoadCntPending != 0) - Wait.LoadCnt = 0; - if (SampleCntPending != 0) - Wait.SampleCnt = 0; - if (BvhCntPending != 0) - Wait.BvhCnt = 0; - - return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets, - OldWaitcntInstr); -} - bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, MachineBasicBlock::instr_iterator It, MachineBasicBlock &Block, @@ -2355,9 +2327,22 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, ++Iter; } + // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if + // needed. + AMDGPU::Waitcnt Wait; if (Block.getFirstTerminator() == Block.end() && - isPreheaderToFlush(Block, ScoreBrackets)) - Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr); + isPreheaderToFlush(Block, ScoreBrackets)) { + if (ScoreBrackets.hasPendingEvent(LOAD_CNT)) + Wait.LoadCnt = 0; + if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT)) + Wait.SampleCnt = 0; + if (ScoreBrackets.hasPendingEvent(BVH_CNT)) + Wait.BvhCnt = 0; + } + + // Combine or remove any redundant waitcnts at the end of the block. 
+ Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets, + OldWaitcntInstr); return Modified; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index c25b0f212826..78d908455e01 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -16,7 +16,6 @@ define i32 @divergent_if_swap_brtarget_order0(i32 %value) { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: .LBB0_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %c = icmp ne i32 %value, 0 @@ -44,7 +43,6 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: .LBB1_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %c = icmp ne i32 %value, 0 @@ -74,7 +72,6 @@ define i32 @divergent_if_nonboolean_condition0(i32 %value) { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: .LBB2_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %c = trunc i32 %value to i1 @@ -106,7 +103,6 @@ define i32 @divergent_if_nonboolean_condition1(ptr addrspace(1) %ptr) { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: .LBB3_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %value = load i32, ptr addrspace(1) %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll index 303dc46e2c88..5c22d5bdcf74 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -131,8 +131,6 @@ define amdgpu_kernel void @is_private_sgpr(ptr 
%ptr) { ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB1_2: ; %bb1 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = call i1 @llvm.amdgcn.is.private(ptr %ptr) br i1 %val, label %bb0, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll index 63702d258757..e005c38355a3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -131,8 +131,6 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB1_2: ; %bb1 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = call i1 @llvm.amdgcn.is.shared(ptr %ptr) br i1 %val, label %bb0, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 352adacbda3e..af6f69130910 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -39,9 +39,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 ; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -65,11 +65,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -92,11 +92,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -253,8 +253,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -504,11 +504,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB2_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -544,11 +544,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB2_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_add_u32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -944,7 +944,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 @@ -952,6 +951,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -974,7 +974,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s3, v1 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -1006,7 +1005,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -1219,11 +1217,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: 
v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] ; GFX8-NEXT: s_mov_b32 s7, 0xf000 @@ -1258,11 +1256,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -1530,10 +1528,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -1557,12 +1555,12 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB7_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -1585,12 +1583,12 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB7_2: ; GFX9-NEXT: s_or_b64 exec, 
exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1751,8 +1749,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive) ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2006,11 +2004,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB9_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -2046,11 +2044,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB9_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -2446,7 +2444,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 @@ -2454,6 +2451,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -2477,7 +2475,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB11_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -2487,6 +2484,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -2509,7 +2507,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB11_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -2519,6 +2516,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3081,11 +3079,11 
@@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB14_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3121,11 +3119,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB14_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3355,11 +3353,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB15_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_or_b32_e32 v0, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3395,11 +3393,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB15_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_or_b32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3629,11 +3627,11 @@ define amdgpu_kernel void 
@xor_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB16_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3669,11 +3667,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB16_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3903,11 +3901,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB17_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_max_i32_e32 v0, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3943,11 +3941,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB17_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_max_i32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -4151,7 +4149,6 @@ define amdgpu_kernel void @max_i64_constant(ptr 
addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 @@ -4162,6 +4159,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -4182,7 +4180,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB18_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -4216,7 +4213,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB18_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -4419,11 +4415,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB19_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_min_i32_e32 v0, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -4459,11 +4455,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB19_4: ; GFX9-NEXT: s_or_b64 
exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_min_i32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -4667,7 +4663,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 @@ -4678,6 +4673,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -4698,7 +4694,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB20_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -4732,7 +4727,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB20_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 @@ -4935,11 +4929,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB21_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
-; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_max_u32_e32 v0, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -4975,11 +4969,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB21_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_max_u32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -5183,7 +5177,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -5193,6 +5186,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -5214,7 +5208,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB22_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -5226,6 +5219,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 s2, -1 ; 
GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -5246,7 +5240,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB22_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -5258,6 +5251,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -5446,11 +5440,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB23_4: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_min_u32_e32 v0, s4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -5486,11 +5480,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB23_4: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_min_u32_e32 v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -5694,7 +5688,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5704,6 +5697,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -5725,7 +5719,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: .LBB24_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5737,6 +5730,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -5757,7 +5751,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: .LBB24_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -5769,6 +5762,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; 
GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index 19a1d2d9dbd1..c9076a9541b2 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -186,7 +186,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: .LBB1_8: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll index ac50fb86c96f..da609bfa8ede 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -41,7 +41,7 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add ; GCN-NEXT: .LBB0_2: ; %endif ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0000 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:2300 ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index 069c57e2ae63..6dabd8c0b83e 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -103,7 +103,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB0_4: ; %exit -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 ; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 @@ -131,7 +130,6 @@ define <4 x i16> 
@vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB0_4: ; %exit -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -266,7 +264,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace( ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB1_4: ; %exit -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 ; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 @@ -294,7 +291,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace( ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB1_4: ; %exit -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -431,7 +427,6 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; GFX9-NEXT: .LBB2_4: ; %exit ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x3800 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc @@ -461,7 +456,6 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB2_4: ; %exit ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -665,7 +659,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB3_4: ; %exit -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -871,7 +864,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB4_4: ; %exit -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1081,7 +1073,6 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB5_4: ; %exit ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -1432,7 +1423,6 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB7_4: ; %exit -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5 ; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00 @@ -1724,7 +1714,6 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: .LBB8_4: ; 
%exit -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v5 ; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00 diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index db89ad66ffab..3b2f15c8340a 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -114,7 +114,6 @@ define void @i1_arg_i1_use(i1 %arg) #0 { ; CIGFX89-NEXT: s_waitcnt vmcnt(0) ; CIGFX89-NEXT: .LBB3_2: ; %bb2 ; CIGFX89-NEXT: s_or_b64 exec, exec, s[4:5] -; CIGFX89-NEXT: s_waitcnt vmcnt(0) ; CIGFX89-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: i1_arg_i1_use: diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll index 433a836e7ca0..3b3e107a6296 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll @@ -33,7 +33,7 @@ define void @func_use_lds_global() { ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-SDAG-NEXT: s_mov_b32 m0, -1 -; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8 ; GFX8-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -103,7 +103,7 @@ define void @func_use_lds_global_constexpr_cast() { ; GFX8-SDAG-LABEL: func_use_lds_global_constexpr_cast: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8 ; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-SDAG-NEXT: s_trap 2 @@ -171,7 +171,7 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_2 ; GFX8-SDAG-NEXT: ; %bb.1: ; %bb1 ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 1 -; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0 +; 
GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8 ; GFX8-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -181,7 +181,7 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_4 ; GFX8-SDAG-NEXT: ; %bb.3: ; %bb0 ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8 ; GFX8-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -189,7 +189,7 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX8-SDAG-NEXT: .LBB2_4: ; %ret ; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8 ; GFX8-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -379,7 +379,7 @@ define void @func_uses_lds_code_after(ptr addrspace(1) %ptr) { ; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-SDAG-NEXT: s_mov_b32 m0, -1 ; GFX8-SDAG-NEXT: ds_write_b32 v0, v2 -; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8 ; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -472,7 +472,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX8-SDAG-NEXT: ; %bb.1: ; %use.bb ; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-SDAG-NEXT: s_mov_b32 m0, -1 -; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8 ; GFX8-SDAG-NEXT: ds_write_b32 v0, v0 ; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -481,7 +481,6 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX8-SDAG-NEXT: .LBB4_2: ; %ret ; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-SDAG-NEXT: 
s_waitcnt vmcnt(0) ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: func_uses_lds_phi_after: @@ -506,7 +505,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: .LBB4_2: ; %ret ; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: func_uses_lds_phi_after: @@ -527,7 +526,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: .LBB4_2: ; %ret ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: func_uses_lds_phi_after: @@ -548,7 +547,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: .LBB4_2: ; %ret ; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-LABEL: func_uses_lds_phi_after: @@ -570,7 +569,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: .LBB4_3: ; %ret ; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] -; SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] ; SDAG-NEXT: .LBB4_4: ; SDAG-NEXT: s_endpgm @@ -594,7 +593,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) { ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: .LBB4_3: ; %ret ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] ; GISEL-NEXT: .LBB4_4: ; GISEL-NEXT: s_endpgm @@ -616,6 +615,3 @@ ret: ; 
CHECK: {{.*}} ; GFX8: {{.*}} ; GFX9: {{.*}} - -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdhsa_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll index 5e76dfd9bddd..4477f028c6d2 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -157,7 +157,6 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; VI-NEXT: .LBB2_2: ; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_mov_b64 s[6:7], exec -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_readfirstlane_b32 s8, v1 ; VI-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 ; VI-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 @@ -203,15 +202,14 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; VI-NEXT: ; %bb.7: ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_add_rtn_f32 v2, v2, v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: .LBB2_8: ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_readfirstlane_b32 s2, v2 ; VI-NEXT: v_add_f32_e32 v2, s2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -240,7 +238,6 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX9-NEXT: .LBB2_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 @@ -285,16 +282,15 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX9-NEXT: s_cbranch_execz .LBB2_8 ; GFX9-NEXT: ; %bb.7: ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_f32 v2, v2, v1 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB2_8: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index 138dd53b3ede..d19ef75cb08c 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1260,8 +1260,6 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 { ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB11_5: ; %end -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB11_6: ; GFX11-NEXT: s_mov_b64 exec, 0 @@ -1525,8 +1523,6 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB13_5: ; %UnifiedReturnBlock -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB13_6: ; GFX11-NEXT: s_mov_b64 exec, 0 diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll index eef5f57beb07..ecebbb9ac874 100644 --- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll +++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll @@ -32,7 +32,7 @@ define amdgpu_ps float @test_return_to_epilog_into_end_block(i32 inreg %a, float ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: 
(volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: S_WAITCNT_soft 3952 + ; GCN-NEXT: S_WAITCNT 3952 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: entry: @@ -79,7 +79,7 @@ define amdgpu_ps float @test_unify_return_to_epilog_into_end_block(i32 inreg %a, ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: S_WAITCNT_soft 3952 + ; GCN-NEXT: S_WAITCNT 3952 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: entry: -- cgit v1.2.3 From 708ce8569067c2aabd3cc669b0db90f23e53b3b0 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Thu, 4 Apr 2024 10:20:14 +0100 Subject: [RemoveDIs][NFC] Use ScopedDbgInfoFormatSetter in more places (#87380) The class `ScopedDbgInfoFormatSetter` was added as a convenient way to temporarily change the debug info format of a function or module, as part of IR printing; since this process is repeated in a number of other places, this patch uses the format-setter class in those places as well. 
--- llvm/include/llvm/IR/DebugProgramInstruction.h | 19 +++++++++++++++++++ llvm/include/llvm/IR/PassManager.h | 24 +----------------------- llvm/include/llvm/IR/PrintPasses.h | 19 ------------------- llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp | 17 ++++------------- llvm/lib/CodeGen/MIRPrinter.cpp | 20 ++++++-------------- llvm/lib/IR/LegacyPassManager.cpp | 7 +------ llvm/lib/Linker/IRMover.cpp | 21 +++------------------ llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp | 9 ++------- llvm/tools/llvm-dis/llvm-dis.cpp | 6 +----- 9 files changed, 37 insertions(+), 105 deletions(-) diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h index c9477131c09c..9f4987493739 100644 --- a/llvm/include/llvm/IR/DebugProgramInstruction.h +++ b/llvm/include/llvm/IR/DebugProgramInstruction.h @@ -659,6 +659,25 @@ getDbgRecordRange(DbgMarker *DebugMarker) { DEFINE_ISA_CONVERSION_FUNCTIONS(DbgRecord, LLVMDbgRecordRef) +/// Used to temporarily set the debug info format of a function, module, or +/// basic block for the duration of this object's lifetime, after which the +/// prior state will be restored. 
+template class ScopedDbgInfoFormatSetter { + T &Obj; + bool OldState; + +public: + ScopedDbgInfoFormatSetter(T &Obj, bool NewState) + : Obj(Obj), OldState(Obj.IsNewDbgInfoFormat) { + Obj.setIsNewDbgInfoFormat(NewState); + } + ~ScopedDbgInfoFormatSetter() { Obj.setIsNewDbgInfoFormat(OldState); } +}; + +template +ScopedDbgInfoFormatSetter(T &Obj, + bool NewState) -> ScopedDbgInfoFormatSetter; + } // namespace llvm #endif // LLVM_IR_DEBUGPROGRAMINSTRUCTION_H diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h index 108465478d37..d701481202f8 100644 --- a/llvm/include/llvm/IR/PassManager.h +++ b/llvm/include/llvm/IR/PassManager.h @@ -64,23 +64,6 @@ extern llvm::cl::opt UseNewDbgInfoFormat; namespace llvm { -// RemoveDIs: Provide facilities for converting debug-info from one form to -// another, which are no-ops for everything but modules. -template inline bool shouldConvertDbgInfo(IRUnitT &IR) { - return false; -} -template <> inline bool shouldConvertDbgInfo(Module &IR) { - return !IR.IsNewDbgInfoFormat && UseNewDbgInfoFormat; -} -template inline void doConvertDbgInfoToNew(IRUnitT &IR) {} -template <> inline void doConvertDbgInfoToNew(Module &IR) { - IR.convertToNewDbgValues(); -} -template inline void doConvertDebugInfoToOld(IRUnitT &IR) {} -template <> inline void doConvertDebugInfoToOld(Module &IR) { - IR.convertFromNewDbgValues(); -} - // Forward declare the analysis manager template. template class AnalysisManager; @@ -229,9 +212,7 @@ public: // RemoveDIs: if requested, convert debug-info to DbgRecord representation // for duration of these passes. 
- bool ShouldConvertDbgInfo = shouldConvertDbgInfo(IR); - if (ShouldConvertDbgInfo) - doConvertDbgInfoToNew(IR); + ScopedDbgInfoFormatSetter FormatSetter(IR, UseNewDbgInfoFormat); for (auto &Pass : Passes) { // Check the PassInstrumentation's BeforePass callbacks before running the @@ -255,9 +236,6 @@ public: PA.intersect(std::move(PassPA)); } - if (ShouldConvertDbgInfo) - doConvertDebugInfoToOld(IR); - // Invalidation was handled after each pass in the above loop for the // current unit of IR. Therefore, the remaining analysis results in the // AnalysisManager are preserved. We mark this with a set so that we don't diff --git a/llvm/include/llvm/IR/PrintPasses.h b/llvm/include/llvm/IR/PrintPasses.h index 3803bd05cbe5..95b97e76c867 100644 --- a/llvm/include/llvm/IR/PrintPasses.h +++ b/llvm/include/llvm/IR/PrintPasses.h @@ -78,25 +78,6 @@ std::string doSystemDiff(StringRef Before, StringRef After, StringRef OldLineFormat, StringRef NewLineFormat, StringRef UnchangedLineFormat); -/// Used to temporarily set the debug info format of a function, module, or -/// basic block for the duration of this object's lifetime, after which the -/// prior state will be restored. 
-template class ScopedDbgInfoFormatSetter { - T &Obj; - bool OldState; - -public: - ScopedDbgInfoFormatSetter(T &Obj, bool NewState) - : Obj(Obj), OldState(Obj.IsNewDbgInfoFormat) { - Obj.setIsNewDbgInfoFormat(NewState); - } - ~ScopedDbgInfoFormatSetter() { Obj.setIsNewDbgInfoFormat(OldState); } -}; - -template -ScopedDbgInfoFormatSetter(T &Obj, bool NewState) - -> ScopedDbgInfoFormatSetter; - } // namespace llvm #endif // LLVM_IR_PRINTPASSES_H diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp index de2396f31f66..4f2486c963e9 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp @@ -21,19 +21,14 @@ using namespace llvm; extern bool WriteNewDbgInfoFormatToBitcode; PreservedAnalyses BitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) { - bool ConvertToOldDbgFormatForWrite = - M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode; - if (ConvertToOldDbgFormatForWrite) - M.convertFromNewDbgValues(); + ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat && + WriteNewDbgInfoFormatToBitcode); const ModuleSummaryIndex *Index = EmitSummaryIndex ? 
&(AM.getResult(M)) : nullptr; WriteBitcodeToFile(M, OS, ShouldPreserveUseListOrder, Index, EmitModuleHash); - if (ConvertToOldDbgFormatForWrite) - M.convertToNewDbgValues(); - return PreservedAnalyses::all(); } @@ -57,16 +52,12 @@ namespace { StringRef getPassName() const override { return "Bitcode Writer"; } bool runOnModule(Module &M) override { - bool ConvertToOldDbgFormatForWrite = - M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode; - if (ConvertToOldDbgFormatForWrite) - M.convertFromNewDbgValues(); + ScopedDbgInfoFormatSetter FormatSetter( + M, M.IsNewDbgInfoFormat && WriteNewDbgInfoFormatToBitcode); WriteBitcodeToFile(M, OS, ShouldPreserveUseListOrder, /*Index=*/nullptr, /*EmitModuleHash=*/false); - if (ConvertToOldDbgFormatForWrite) - M.convertToNewDbgValues(); return false; } void getAnalysisUsage(AnalysisUsage &AU) const override { diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index bbc6d39d17fc..bf3aee67ec00 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -69,6 +69,8 @@ static cl::opt SimplifyMIR( static cl::opt PrintLocations("mir-debug-loc", cl::Hidden, cl::init(true), cl::desc("Print MIR debug-locations")); +extern cl::opt WriteNewDbgInfoFormat; + namespace { /// This structure describes how to print out stack object references. @@ -986,29 +988,19 @@ void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V, } void llvm::printMIR(raw_ostream &OS, const Module &M) { - // RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info - // in dbg.value format. 
- bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat; - if (IsNewDbgInfoFormat) - const_cast(M).convertFromNewDbgValues(); + ScopedDbgInfoFormatSetter FormatSetter(const_cast(M), + WriteNewDbgInfoFormat); yaml::Output Out(OS); Out << const_cast(M); - - if (IsNewDbgInfoFormat) - const_cast(M).convertToNewDbgValues(); } void llvm::printMIR(raw_ostream &OS, const MachineFunction &MF) { // RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info // in dbg.value format. - bool IsNewDbgInfoFormat = MF.getFunction().IsNewDbgInfoFormat; - if (IsNewDbgInfoFormat) - const_cast(MF.getFunction()).convertFromNewDbgValues(); + ScopedDbgInfoFormatSetter FormatSetter( + const_cast(MF.getFunction()), WriteNewDbgInfoFormat); MIRPrinter Printer(OS); Printer.print(MF); - - if (IsNewDbgInfoFormat) - const_cast(MF.getFunction()).convertToNewDbgValues(); } diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp index 953f21ce7405..d361bd9a9839 100644 --- a/llvm/lib/IR/LegacyPassManager.cpp +++ b/llvm/lib/IR/LegacyPassManager.cpp @@ -531,9 +531,7 @@ bool PassManagerImpl::run(Module &M) { // RemoveDIs: if a command line flag is given, convert to the // DbgVariableRecord representation of debug-info for the duration of these // passes. 
- bool shouldConvertDbgInfo = UseNewDbgInfoFormat && !M.IsNewDbgInfoFormat; - if (shouldConvertDbgInfo) - M.convertToNewDbgValues(); + ScopedDbgInfoFormatSetter FormatSetter(M, UseNewDbgInfoFormat); for (ImmutablePass *ImPass : getImmutablePasses()) Changed |= ImPass->doInitialization(M); @@ -547,9 +545,6 @@ bool PassManagerImpl::run(Module &M) { for (ImmutablePass *ImPass : getImmutablePasses()) Changed |= ImPass->doFinalization(M); - if (shouldConvertDbgInfo) - M.convertFromNewDbgValues(); - return Changed; } } // namespace legacy diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index a7e6db82e5c2..7a5aa0c80478 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -1548,25 +1548,10 @@ Error IRLinker::run() { return Err; // Convert source module to match dest for the duration of the link. - bool SrcModuleNewDbgFormat = SrcM->IsNewDbgInfoFormat; - if (DstM.IsNewDbgInfoFormat != SrcM->IsNewDbgInfoFormat) { - if (DstM.IsNewDbgInfoFormat) - SrcM->convertToNewDbgValues(); - else - SrcM->convertFromNewDbgValues(); - } - // Undo debug mode conversion afterwards. - auto Cleanup = make_scope_exit([&]() { - if (SrcModuleNewDbgFormat != SrcM->IsNewDbgInfoFormat) { - if (SrcModuleNewDbgFormat) - SrcM->convertToNewDbgValues(); - else - SrcM->convertFromNewDbgValues(); - } - }); + ScopedDbgInfoFormatSetter FormatSetter(*SrcM, DstM.IsNewDbgInfoFormat); - // Inherit the target data from the source module if the destination module - // doesn't have one already. + // Inherit the target data from the source module if the destination + // module doesn't have one already. 
if (DstM.getDataLayout().isDefault()) DstM.setDataLayout(SrcM->getDataLayout()); diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 3986359b6a5a..4df18c824927 100644 --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -583,10 +583,8 @@ llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) { // RemoveDIs: there's no bitcode representation of the DbgVariableRecord // debug-info, convert to dbg.values before writing out. - bool ConvertToOldDbgFormatForWrite = - M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode; - if (ConvertToOldDbgFormatForWrite) - M.convertFromNewDbgValues(); + ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat && + WriteNewDbgInfoFormatToBitcode); bool Changed = writeThinLTOBitcode( OS, ThinLinkOS, @@ -595,8 +593,5 @@ llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) { }, M, &AM.getResult(M)); - if (ConvertToOldDbgFormatForWrite) - M.convertToNewDbgValues(); - return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp index 49154dc46c57..6ad1c99568e4 100644 --- a/llvm/tools/llvm-dis/llvm-dis.cpp +++ b/llvm/tools/llvm-dis/llvm-dis.cpp @@ -258,12 +258,8 @@ int main(int argc, char **argv) { // All that llvm-dis does is write the assembly to a file. 
if (!DontPrint) { if (M) { - bool ChangeDbgFormat = M->IsNewDbgInfoFormat != WriteNewDbgInfoFormat; - if (ChangeDbgFormat) - M->setIsNewDbgInfoFormat(WriteNewDbgInfoFormat); + ScopedDbgInfoFormatSetter FormatSetter(*M, WriteNewDbgInfoFormat); M->print(Out->os(), Annotator.get(), PreserveAssemblyUseListOrder); - if (ChangeDbgFormat) - M->setIsNewDbgInfoFormat(!WriteNewDbgInfoFormat); } if (Index) Index->print(Out->os()); -- cgit v1.2.3 From 7a8cf951b3bdc60feac412200ab9661e009d44ae Mon Sep 17 00:00:00 2001 From: Tim Northover Date: Thu, 4 Apr 2024 09:52:48 +0100 Subject: AArch64-Darwin: allow -mcmodel=large with (default) PIC Darwin targets implement -mcmodel=large by forcing all global accesses to use the GOT, instead of the ELF movz/movk sequence. That means it's compatible with PIC so the Clang driver shouldn't reject the option. --- clang/lib/Driver/ToolChains/Clang.cpp | 3 ++- clang/test/Driver/mcmodel.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index b7ec7e0a6097..766a9b91e3c0 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5882,7 +5882,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CM = "large"; if (Triple.isAArch64(64)) { Ok = CM == "tiny" || CM == "small" || CM == "large"; - if (CM == "large" && RelocationModel != llvm::Reloc::Static) + if (CM == "large" && !Triple.isOSBinFormatMachO() && + RelocationModel != llvm::Reloc::Static) D.Diag(diag::err_drv_argument_only_allowed_with) << A->getAsString(Args) << "-fno-pic"; } else if (Triple.isLoongArch()) { diff --git a/clang/test/Driver/mcmodel.c b/clang/test/Driver/mcmodel.c index 1eb6ae16ff47..9681c32579d7 100644 --- a/clang/test/Driver/mcmodel.c +++ b/clang/test/Driver/mcmodel.c @@ -11,6 +11,7 @@ // RUN: FileCheck --check-prefix=AIX-MCMEDIUM-OVERRIDE %s < %t.log // RUN: not %clang -### -c -mcmodel=lager %s 2>&1 | FileCheck 
--check-prefix=INVALID %s // RUN: %clang --target=aarch64 -### -S -mcmodel=large -fno-pic %s 2>&1 | FileCheck --check-prefix=LARGE %s +// RUN: %clang --target=aarch64-apple-macosx -### -S -mcmodel=large %s 2>&1 | FileCheck --check-prefix=LARGE %s // RUN: not %clang --target=aarch64 -### -S -mcmodel=large -fpic %s 2>&1 | FileCheck --check-prefix=AARCH64-PIC-LARGE %s // RUN: not %clang -### -c --target=aarch64 -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=ERR-MEDIUM %s // RUN: not %clang -### -c --target=aarch64 -mcmodel=kernel %s 2>&1 | FileCheck --check-prefix=ERR-KERNEL %s -- cgit v1.2.3 From cca9115b1c640a307b510821cb84a1f3bb76b969 Mon Sep 17 00:00:00 2001 From: Daniil Kovalev Date: Thu, 4 Apr 2024 12:38:09 +0300 Subject: [lld][AArch64][ELF][PAC] Support AUTH relocations and AUTH ELF marking (#72714) This patch adds lld support for: - Dynamic R_AARCH64_AUTH_* relocations (without including RELR compressed AUTH relocations) as described here: https://github.com/ARM-software/abi-aa/blob/main/pauthabielf64/pauthabielf64.rst#auth-variant-dynamic-relocations - .note.AARCH64-PAUTH-ABI-tag section as defined here https://github.com/ARM-software/abi-aa/blob/main/pauthabielf64/pauthabielf64.rst#elf-marking Depends on #72713 and #85231 --------- Co-authored-by: Peter Collingbourne Co-authored-by: Fangrui Song --- lld/ELF/Arch/AArch64.cpp | 4 +- lld/ELF/Config.h | 3 + lld/ELF/Driver.cpp | 70 +++++++++++++++---- lld/ELF/InputFiles.cpp | 41 +++++------ lld/ELF/InputFiles.h | 1 + lld/ELF/InputSection.cpp | 1 + lld/ELF/Relocations.cpp | 28 ++++++-- lld/ELF/Relocations.h | 1 + lld/ELF/SyntheticSections.cpp | 40 ++++++++--- lld/ELF/Writer.cpp | 3 +- lld/docs/ReleaseNotes.rst | 3 + lld/docs/ld.lld.1 | 5 ++ lld/test/ELF/aarch64-bti-pac-cli-error.s | 15 ++-- lld/test/ELF/aarch64-feature-pauth.s | 114 +++++++++++++++++++++++++++++++ lld/test/ELF/aarch64-reloc-pauth-ro.s | 22 ++++++ lld/test/ELF/aarch64-reloc-pauth.s | 108 +++++++++++++++++++++++++++++ 16 files changed, 403 
insertions(+), 56 deletions(-) create mode 100644 lld/test/ELF/aarch64-feature-pauth.s create mode 100644 lld/test/ELF/aarch64-reloc-pauth-ro.s create mode 100644 lld/test/ELF/aarch64-reloc-pauth.s diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp index 30ccd68f7b75..017c17c2b03d 100644 --- a/lld/ELF/Arch/AArch64.cpp +++ b/lld/ELF/Arch/AArch64.cpp @@ -113,6 +113,8 @@ RelExpr AArch64::getRelExpr(RelType type, const Symbol &s, case R_AARCH64_MOVW_UABS_G2_NC: case R_AARCH64_MOVW_UABS_G3: return R_ABS; + case R_AARCH64_AUTH_ABS64: + return R_AARCH64_AUTH; case R_AARCH64_TLSDESC_ADR_PAGE21: return R_AARCH64_TLSDESC_PAGE; case R_AARCH64_TLSDESC_LD64_LO12: @@ -204,7 +206,7 @@ bool AArch64::usesOnlyLowPageBits(RelType type) const { } RelType AArch64::getDynRel(RelType type) const { - if (type == R_AARCH64_ABS64) + if (type == R_AARCH64_ABS64 || type == R_AARCH64_AUTH_ABS64) return type; return R_AARCH64_NONE; } diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 27274d69b7f1..83f293ab2ce5 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -187,6 +187,7 @@ struct Config { llvm::StringRef cmseOutputLib; StringRef zBtiReport = "none"; StringRef zCetReport = "none"; + StringRef zPauthReport = "none"; bool ltoBBAddrMap; llvm::StringRef ltoBasicBlockSections; std::pair thinLTOObjectSuffixReplace; @@ -499,6 +500,8 @@ struct Ctx { void reset(); llvm::raw_fd_ostream openAuxiliaryFile(llvm::StringRef, std::error_code &); + + ArrayRef aarch64PauthAbiCoreInfo; }; LLVM_LIBRARY_VISIBILITY extern Ctx ctx; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index b43da7727e22..8dbff7fb86e7 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -46,6 +46,7 @@ #include "lld/Common/Strings.h" #include "lld/Common/TargetOptionsCommandFlags.h" #include "lld/Common/Version.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" @@ -461,6 +462,8 @@ static void checkOptions() { error("-z 
force-bti only supported on AArch64"); if (config->zBtiReport != "none") error("-z bti-report only supported on AArch64"); + if (config->zPauthReport != "none") + error("-z pauth-report only supported on AArch64"); } if (config->emachine != EM_386 && config->emachine != EM_X86_64 && @@ -1501,7 +1504,8 @@ static void readConfigs(opt::InputArgList &args) { } auto reports = {std::make_pair("bti-report", &config->zBtiReport), - std::make_pair("cet-report", &config->zCetReport)}; + std::make_pair("cet-report", &config->zCetReport), + std::make_pair("pauth-report", &config->zPauthReport)}; for (opt::Arg *arg : args.filtered(OPT_z)) { std::pair option = StringRef(arg->getValue()).split('='); @@ -2599,14 +2603,17 @@ static void redirectSymbols(ArrayRef wrapped) { symtab.wrap(w.sym, w.real, w.wrap); } +static void reportMissingFeature(StringRef config, const Twine &report) { + if (config == "error") + error(report); + else if (config == "warning") + warn(report); +} + static void checkAndReportMissingFeature(StringRef config, uint32_t features, uint32_t mask, const Twine &report) { - if (!(features & mask)) { - if (config == "error") - error(report); - else if (config == "warning") - warn(report); - } + if (!(features & mask)) + reportMissingFeature(config, report); } // To enable CET (x86's hardware-assisted control flow enforcement), each @@ -2617,12 +2624,28 @@ static void checkAndReportMissingFeature(StringRef config, uint32_t features, // // This is also the case with AARCH64's BTI and PAC which use the similar // GNU_PROPERTY_AARCH64_FEATURE_1_AND mechanism. -static uint32_t getAndFeatures() { +// +// For AArch64 PAuth-enabled object files, the core info of all of them must +// match. Missing info for some object files with matching info for remaining +// ones can be allowed (see -z pauth-report). 
+static void readSecurityNotes() { if (config->emachine != EM_386 && config->emachine != EM_X86_64 && config->emachine != EM_AARCH64) - return 0; + return; + + config->andFeatures = -1; + + StringRef referenceFileName; + if (config->emachine == EM_AARCH64) { + auto it = llvm::find_if(ctx.objectFiles, [](const ELFFileBase *f) { + return !f->aarch64PauthAbiCoreInfo.empty(); + }); + if (it != ctx.objectFiles.end()) { + ctx.aarch64PauthAbiCoreInfo = (*it)->aarch64PauthAbiCoreInfo; + referenceFileName = (*it)->getName(); + } + } - uint32_t ret = -1; for (ELFFileBase *f : ctx.objectFiles) { uint32_t features = f->andFeatures; @@ -2658,14 +2681,31 @@ static uint32_t getAndFeatures() { "GNU_PROPERTY_AARCH64_FEATURE_1_PAC property"); features |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC; } - ret &= features; + config->andFeatures &= features; + + if (ctx.aarch64PauthAbiCoreInfo.empty()) + continue; + + if (f->aarch64PauthAbiCoreInfo.empty()) { + reportMissingFeature(config->zPauthReport, + toString(f) + + ": -z pauth-report: file does not have AArch64 " + "PAuth core info while '" + + referenceFileName + "' has one"); + continue; + } + + if (ctx.aarch64PauthAbiCoreInfo != f->aarch64PauthAbiCoreInfo) + errorOrWarn("incompatible values of AArch64 PAuth core info found\n>>> " + + referenceFileName + ": 0x" + + toHex(ctx.aarch64PauthAbiCoreInfo, /*LowerCase=*/true) + + "\n>>> " + toString(f) + ": 0x" + + toHex(f->aarch64PauthAbiCoreInfo, /*LowerCase=*/true)); } // Force enable Shadow Stack. if (config->zShstk) - ret |= GNU_PROPERTY_X86_FEATURE_1_SHSTK; - - return ret; + config->andFeatures |= GNU_PROPERTY_X86_FEATURE_1_SHSTK; } static void initSectionsAndLocalSyms(ELFFileBase *file, bool ignoreComdats) { @@ -2944,7 +2984,7 @@ template void LinkerDriver::link(opt::InputArgList &args) { // Read .note.gnu.property sections from input object files which // contain a hint to tweak linker's and loader's behaviors. 
- config->andFeatures = getAndFeatures(); + readSecurityNotes(); // The Target instance handles target-specific stuff, such as applying // relocations or writing a PLT section. It also contains target-dependent diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index 6529ea072fae..1f496026d3ae 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -926,25 +926,18 @@ void ObjFile::initializeSections(bool ignoreComdats, handleSectionGroup(this->sections, entries); } -// If a source file is compiled with x86 hardware-assisted call flow control -// enabled, the generated object file contains feature flags indicating that -// fact. This function reads the feature flags and returns it. -// -// Essentially we want to read a single 32-bit value in this function, but this -// function is rather complicated because the value is buried deep inside a -// .note.gnu.property section. -// -// The section consists of one or more NOTE records. Each NOTE record consists -// of zero or more type-length-value fields. We want to find a field of a -// certain type. It seems a bit too much to just store a 32-bit value, perhaps -// the ABI is unnecessarily complicated. -template static uint32_t readAndFeatures(const InputSection &sec) { +// Read the following info from the .note.gnu.property section and write it to +// the corresponding fields in `ObjFile`: +// - Feature flags (32 bits) representing x86 or AArch64 features for +// hardware-assisted call flow control; +// - AArch64 PAuth ABI core info (16 bytes). 
+template +void readGnuProperty(const InputSection &sec, ObjFile &f) { using Elf_Nhdr = typename ELFT::Nhdr; using Elf_Note = typename ELFT::Note; - uint32_t featuresSet = 0; ArrayRef data = sec.content(); - auto reportFatal = [&](const uint8_t *place, const char *msg) { + auto reportFatal = [&](const uint8_t *place, const Twine &msg) { fatal(toString(sec.file) + ":(" + sec.name + "+0x" + Twine::utohexstr(place - sec.content().data()) + "): " + msg); }; @@ -983,7 +976,19 @@ template static uint32_t readAndFeatures(const InputSection &sec) { // accumulate the bits set. if (size < 4) reportFatal(place, "FEATURE_1_AND entry is too short"); - featuresSet |= read32(desc.data()); + f.andFeatures |= read32(desc.data()); + } else if (config->emachine == EM_AARCH64 && + type == GNU_PROPERTY_AARCH64_FEATURE_PAUTH) { + if (!f.aarch64PauthAbiCoreInfo.empty()) { + reportFatal(data.data(), + "multiple GNU_PROPERTY_AARCH64_FEATURE_PAUTH entries are " + "not supported"); + } else if (size != 16) { + reportFatal(data.data(), "GNU_PROPERTY_AARCH64_FEATURE_PAUTH entry " + "is invalid: expected 16 bytes, but got " + + Twine(size)); + } + f.aarch64PauthAbiCoreInfo = desc; } // Padding is present in the note descriptor, if necessary. @@ -993,8 +998,6 @@ template static uint32_t readAndFeatures(const InputSection &sec) { // Go to next NOTE record to look for more FEATURE_1_AND descriptions. data = data.slice(nhdr->getSize(sec.addralign)); } - - return featuresSet; } template @@ -1051,7 +1054,7 @@ InputSectionBase *ObjFile::createInputSection(uint32_t idx, // .note.gnu.property containing a single AND'ed bitmap, we discard an input // file's .note.gnu.property section. 
if (name == ".note.gnu.property") { - this->andFeatures = readAndFeatures(InputSection(*this, sec, name)); + readGnuProperty(InputSection(*this, sec, name), *this); return &InputSection::discarded; } diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h index 95197599a2e1..834b3b63dd83 100644 --- a/lld/ELF/InputFiles.h +++ b/lld/ELF/InputFiles.h @@ -230,6 +230,7 @@ protected: public: uint32_t andFeatures = 0; bool hasCommonSyms = false; + ArrayRef aarch64PauthAbiCoreInfo; }; // .o file. diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 4f88313b868b..c06816bcfd56 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -676,6 +676,7 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type, case R_DTPREL: case R_RELAX_TLS_LD_TO_LE_ABS: case R_RELAX_GOT_PC_NOPIC: + case R_AARCH64_AUTH: case R_RISCV_ADD: case R_RISCV_LEB128: return sym.getVA(a); diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 92f2e200db11..55274344f881 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -995,7 +995,8 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr e, RelType type, if (e == R_GOT || e == R_PLT) return target->usesOnlyLowPageBits(type) || !config->isPic; - if (sym.isPreemptible) + // R_AARCH64_AUTH_ABS64 requires a dynamic relocation. 
+ if (sym.isPreemptible || e == R_AARCH64_AUTH) return false; if (!config->isPic) return true; @@ -1141,12 +1142,26 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, (rel == target->symbolicRel && !sym.isPreemptible)) { addRelativeReloc(*sec, offset, sym, addend, expr, type); return; - } else if (rel != 0) { + } + if (rel != 0) { if (config->emachine == EM_MIPS && rel == target->symbolicRel) rel = target->relativeRel; std::lock_guard lock(relocMutex); - sec->getPartition().relaDyn->addSymbolReloc(rel, *sec, offset, sym, - addend, type); + Partition &part = sec->getPartition(); + if (config->emachine == EM_AARCH64 && type == R_AARCH64_AUTH_ABS64) { + // For a preemptible symbol, we can't use a relative relocation. For an + // undefined symbol, we can't compute offset at link-time and use a + // relative relocation. Use a symbolic relocation instead. + if (sym.isPreemptible) { + part.relaDyn->addSymbolReloc(type, *sec, offset, sym, addend, type); + } else { + part.relaDyn->addReloc({R_AARCH64_AUTH_RELATIVE, sec, offset, + DynamicReloc::AddendOnlyWithTargetVA, sym, + addend, R_ABS}); + } + return; + } + part.relaDyn->addSymbolReloc(rel, *sec, offset, sym, addend, type); // MIPS ABI turns using of GOT and dynamic relocations inside out. // While regular ABI uses dynamic relocations to fill up GOT entries @@ -1171,7 +1186,10 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, // When producing an executable, we can perform copy relocations (for // STT_OBJECT) and canonical PLT (for STT_FUNC) if sym is defined by a DSO. - if (!config->shared && sym.isShared()) { + // Copy relocations/canonical PLT entries are unsupported for + // R_AARCH64_AUTH_ABS64. 
+ if (!config->shared && sym.isShared() && + !(config->emachine == EM_AARCH64 && type == R_AARCH64_AUTH_ABS64)) { if (!canDefineSymbolInExecutable(sym)) { errorOrWarn("cannot preempt symbol: " + toString(sym) + getLocation(*sec, sym, offset)); diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h index 7eb8a811e693..b7b9c09e1b89 100644 --- a/lld/ELF/Relocations.h +++ b/lld/ELF/Relocations.h @@ -87,6 +87,7 @@ enum RelExpr { R_AARCH64_PAGE_PC, R_AARCH64_RELAX_TLS_GD_TO_IE_PAGE_PC, R_AARCH64_TLSDESC_PAGE, + R_AARCH64_AUTH, R_ARM_PCA, R_ARM_SBREL, R_MIPS_GOTREL, diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index d4dc713b4e27..4427a1200868 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -314,22 +314,42 @@ GnuPropertySection::GnuPropertySection() config->wordsize, ".note.gnu.property") {} void GnuPropertySection::writeTo(uint8_t *buf) { + write32(buf, 4); // Name size + write32(buf + 4, getSize() - 16); // Content size + write32(buf + 8, NT_GNU_PROPERTY_TYPE_0); // Type + memcpy(buf + 12, "GNU", 4); // Name string + uint32_t featureAndType = config->emachine == EM_AARCH64 ? GNU_PROPERTY_AARCH64_FEATURE_1_AND : GNU_PROPERTY_X86_FEATURE_1_AND; - write32(buf, 4); // Name size - write32(buf + 4, config->is64 ? 
16 : 12); // Content size - write32(buf + 8, NT_GNU_PROPERTY_TYPE_0); // Type - memcpy(buf + 12, "GNU", 4); // Name string - write32(buf + 16, featureAndType); // Feature type - write32(buf + 20, 4); // Feature size - write32(buf + 24, config->andFeatures); // Feature flags - if (config->is64) - write32(buf + 28, 0); // Padding + unsigned offset = 16; + if (config->andFeatures != 0) { + write32(buf + offset + 0, featureAndType); // Feature type + write32(buf + offset + 4, 4); // Feature size + write32(buf + offset + 8, config->andFeatures); // Feature flags + if (config->is64) + write32(buf + offset + 12, 0); // Padding + offset += 16; + } + + if (!ctx.aarch64PauthAbiCoreInfo.empty()) { + write32(buf + offset + 0, GNU_PROPERTY_AARCH64_FEATURE_PAUTH); + write32(buf + offset + 4, ctx.aarch64PauthAbiCoreInfo.size()); + memcpy(buf + offset + 8, ctx.aarch64PauthAbiCoreInfo.data(), + ctx.aarch64PauthAbiCoreInfo.size()); + } } -size_t GnuPropertySection::getSize() const { return config->is64 ? 32 : 28; } +size_t GnuPropertySection::getSize() const { + uint32_t contentSize = 0; + if (config->andFeatures != 0) + contentSize += config->is64 ? 
16 : 12; + if (!ctx.aarch64PauthAbiCoreInfo.empty()) + contentSize += 4 + 4 + ctx.aarch64PauthAbiCoreInfo.size(); + assert(contentSize != 0); + return contentSize + 16; +} BuildIdSection::BuildIdSection() : SyntheticSection(SHF_ALLOC, SHT_NOTE, 4, ".note.gnu.build-id"), diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 40d617b7fdf3..fc9084f40044 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -24,6 +24,7 @@ #include "lld/Common/CommonLinkerContext.h" #include "lld/Common/Filesystem.h" #include "lld/Common/Strings.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/Support/BLAKE3.h" #include "llvm/Support/Parallel.h" @@ -564,7 +565,7 @@ template void elf::createSyntheticSections() { in.iplt = std::make_unique(); add(*in.iplt); - if (config->andFeatures) + if (config->andFeatures || !ctx.aarch64PauthAbiCoreInfo.empty()) add(*make()); // .note.GNU-stack is always added when we are creating a re-linkable diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index 97ed06048910..bf0c8e55d103 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -29,6 +29,9 @@ ELF Improvements * ``--compress-sections =[none|zlib|zstd]`` is added to compress matched output sections without the ``SHF_ALLOC`` flag. (`#84855 `_) +* ``GNU_PROPERTY_AARCH64_FEATURE_PAUTH`` notes, ``R_AARCH64_AUTH_ABS64`` and + ``R_AARCH64_AUTH_RELATIVE`` relocations are now supported. + (`#72714 `_) Breaking changes ---------------- diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1 index 65e50e349c8c..e0316730f442 100644 --- a/lld/docs/ld.lld.1 +++ b/lld/docs/ld.lld.1 @@ -762,6 +762,11 @@ Specify how to report the missing GNU_PROPERTY_X86_FEATURE_1_IBT or GNU_PROPERTY .Cm none is the default, linker will not report the missing property otherwise will be reported as a warning or an error. .Pp +.It Cm pauth-report Ns = Ns Ar [none|warning|error] +Specify how to report the missing GNU_PROPERTY_AARCH64_FEATURE_PAUTH property. 
+.Cm none +is the default, linker will not report the missing property otherwise will be reported as a warning or an error. +.Pp .It Cm force-bti Force enable AArch64 BTI instruction in PLT, warn if Input ELF file does not have GNU_PROPERTY_AARCH64_FEATURE_1_BTI property. .Pp diff --git a/lld/test/ELF/aarch64-bti-pac-cli-error.s b/lld/test/ELF/aarch64-bti-pac-cli-error.s index b8ab1a28fa5a..703c0aac6ea5 100644 --- a/lld/test/ELF/aarch64-bti-pac-cli-error.s +++ b/lld/test/ELF/aarch64-bti-pac-cli-error.s @@ -1,17 +1,22 @@ # REQUIRES: x86 # RUN: llvm-mc --triple=x86_64-pc-linux --filetype=obj -o %t.o %s -# RUN: not ld.lld -z pac-plt -z force-bti -z bti-report=error %t.o -o /dev/null 2>&1 | FileCheck %s +# RUN: not ld.lld -z pac-plt -z force-bti -z bti-report=error \ +# RUN: -z pauth-report=error %t.o -o /dev/null 2>&1 | FileCheck %s +# RUN: not ld.lld -z pac-plt -z force-bti -z bti-report=warning \ +# RUN: -z pauth-report=warning %t.o -o /dev/null 2>&1 | FileCheck %s # -## Check that we error if -z pac-plt, -z force-bti and -z bti-report=error are used when target is not -## aarch64 +## Check that we error if -z pac-plt, -z force-bti are present and +## -z bti-report and -z pauth-report are not none when target is not aarch64 # CHECK: error: -z pac-plt only supported on AArch64 # CHECK-NEXT: error: -z force-bti only supported on AArch64 # CHECK-NEXT: error: -z bti-report only supported on AArch64 +# CHECK-NEXT: error: -z pauth-report only supported on AArch64 -# RUN: not ld.lld -z bti-report=something %t.o -o /dev/null 2>&1 | \ -# RUN: FileCheck --check-prefix=REPORT_INVALID %s +# RUN: not ld.lld -z bti-report=something -z pauth-report=something \ +# RUN: %t.o -o /dev/null 2>&1 | FileCheck --check-prefix=REPORT_INVALID %s # REPORT_INVALID: error: -z bti-report= parameter something is not recognized +# REPORT_INVALID: error: -z pauth-report= parameter something is not recognized # REPORT_INVALID-EMPTY: .globl start diff --git a/lld/test/ELF/aarch64-feature-pauth.s 
b/lld/test/ELF/aarch64-feature-pauth.s new file mode 100644 index 000000000000..699a650d7229 --- /dev/null +++ b/lld/test/ELF/aarch64-feature-pauth.s @@ -0,0 +1,114 @@ +# REQUIRES: aarch64 + +# RUN: rm -rf %t && split-file %s %t && cd %t + +# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag1.s -o tag1.o +# RUN: cp tag1.o tag1a.o +# RUN: ld.lld -shared tag1.o tag1a.o -o tagok.so +# RUN: llvm-readelf -n tagok.so | FileCheck --check-prefix OK %s + +# OK: AArch64 PAuth ABI core info: platform 0x2a (unknown), version 0x1 + +# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag2.s -o tag2.o +# RUN: not ld.lld tag1.o tag1a.o tag2.o -o /dev/null 2>&1 | FileCheck --check-prefix ERR1 %s + +# ERR1: error: incompatible values of AArch64 PAuth core info found +# ERR1-NEXT: >>> tag1.o: 0x2a000000000000000{{1|2}}00000000000000 +# ERR1-NEXT: >>> tag2.o: 0x2a000000000000000{{1|2}}00000000000000 + +# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-short.s -o short.o +# RUN: not ld.lld short.o -o /dev/null 2>&1 | FileCheck --check-prefix ERR2 %s + +# ERR2: error: short.o:(.note.gnu.property+0x0): GNU_PROPERTY_AARCH64_FEATURE_PAUTH entry is invalid: expected 16 bytes, but got 12 + +# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-long.s -o long.o +# RUN: not ld.lld long.o -o /dev/null 2>&1 | FileCheck --check-prefix ERR3 %s + +# ERR3: error: long.o:(.note.gnu.property+0x0): GNU_PROPERTY_AARCH64_FEATURE_PAUTH entry is invalid: expected 16 bytes, but got 24 + +# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-multiple.s -o multiple.o +# RUN: not ld.lld multiple.o -o /dev/null 2>&1 | FileCheck --check-prefix ERR4 %s +# ERR4: error: multiple.o:(.note.gnu.property+0x0): multiple GNU_PROPERTY_AARCH64_FEATURE_PAUTH entries are not supported + +# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu no-info.s -o noinfo1.o +# RUN: cp noinfo1.o noinfo2.o +# RUN: not ld.lld -z pauth-report=error noinfo1.o tag1.o noinfo2.o -o /dev/null 
2>&1 | FileCheck --check-prefix ERR5 %s +# RUN: ld.lld -z pauth-report=warning noinfo1.o tag1.o noinfo2.o -o /dev/null 2>&1 | FileCheck --check-prefix WARN %s +# RUN: ld.lld -z pauth-report=none noinfo1.o tag1.o noinfo2.o --fatal-warnings -o /dev/null + +# ERR5: error: noinfo1.o: -z pauth-report: file does not have AArch64 PAuth core info while 'tag1.o' has one +# ERR5-NEXT: error: noinfo2.o: -z pauth-report: file does not have AArch64 PAuth core info while 'tag1.o' has one +# WARN: warning: noinfo1.o: -z pauth-report: file does not have AArch64 PAuth core info while 'tag1.o' has one +# WARN-NEXT: warning: noinfo2.o: -z pauth-report: file does not have AArch64 PAuth core info while 'tag1.o' has one + +#--- abi-tag-short.s + +.section ".note.gnu.property", "a" +.long 4 +.long 20 +.long 5 +.asciz "GNU" +.long 0xc0000001 +.long 12 +.quad 2 +.long 31 + +#--- abi-tag-long.s + +.section ".note.gnu.property", "a" +.long 4 +.long 32 +.long 5 +.asciz "GNU" +.long 0xc0000001 +.long 24 +.quad 2 +.quad 31 +.quad 0 + +#--- abi-tag-multiple.s + +.section ".note.gnu.property", "a" +.long 4 +.long 48 +.long 5 +.asciz "GNU" +.long 0xc0000001 +.long 16 +.quad 42 // platform +.quad 1 // version +.long 0xc0000001 +.long 16 +.quad 42 // platform +.quad 1 // version + +#--- abi-tag1.s + +.section ".note.gnu.property", "a" +.long 4 +.long 24 +.long 5 +.asciz "GNU" +.long 0xc0000001 +.long 16 +.quad 42 // platform +.quad 1 // version + +#--- abi-tag2.s + +.section ".note.gnu.property", "a" +.long 4 +.long 24 +.long 5 +.asciz "GNU" +.long 0xc0000001 +.long 16 +.quad 42 // platform +.quad 2 // version + +#--- no-info.s + +## define _start to avoid missing entry warning and use --fatal-warnings to assert no diagnostic +## allow multiple definitions of _start for simplicity +.weak _start; +_start: diff --git a/lld/test/ELF/aarch64-reloc-pauth-ro.s b/lld/test/ELF/aarch64-reloc-pauth-ro.s new file mode 100644 index 000000000000..1be78ba4666e --- /dev/null +++ 
b/lld/test/ELF/aarch64-reloc-pauth-ro.s @@ -0,0 +1,22 @@ +# REQUIRES: aarch64 + +# RUN: llvm-mc -filetype=obj -triple=aarch64 %p/Inputs/shared2.s -o %t.so.o +# RUN: ld.lld -shared %t.so.o -soname=so -o %t.so +# RUN: llvm-mc -filetype=obj -triple=aarch64 %s -o %t.o +# RUN: not ld.lld -pie %t.o %t.so -o %t2 2>&1 | FileCheck -DFILE=%t %s --implicit-check-not=error: + +# CHECK: error: relocation R_AARCH64_AUTH_ABS64 cannot be used against symbol 'zed2'; recompile with -fPIC +# CHECK-NEXT: >>> defined in [[FILE]].so +# CHECK-NEXT: >>> referenced by [[FILE]].o:(.ro+0x0) + +# CHECK: error: relocation R_AARCH64_AUTH_ABS64 cannot be used against symbol 'bar2'; recompile with -fPIC +# CHECK: error: relocation R_AARCH64_AUTH_ABS64 cannot be used against local symbol; recompile with -fPIC + +foo: +.type foo, @function + +.section .ro, "a" +.p2align 3 +.quad zed2@AUTH(da,42) +.quad bar2@AUTH(ia,42) +.quad foo@AUTH(ia,42) diff --git a/lld/test/ELF/aarch64-reloc-pauth.s b/lld/test/ELF/aarch64-reloc-pauth.s new file mode 100644 index 000000000000..b603d8ffdcab --- /dev/null +++ b/lld/test/ELF/aarch64-reloc-pauth.s @@ -0,0 +1,108 @@ +# REQUIRES: aarch64 + +# RUN: llvm-mc -filetype=obj -triple=aarch64 %p/Inputs/shared2.s -o %t.a.o +# RUN: ld.lld -shared %t.a.o -soname=so -o %t.a.so +# RUN: llvm-mc -filetype=obj -triple=aarch64 %s -o %t.o + +# RUN: ld.lld -pie %t.o %t.a.so -o %t +# RUN: llvm-readobj -r %t | FileCheck --check-prefix=UNPACKED %s + +# UNPACKED: Section ({{.+}}) .rela.dyn { +# UNPACKED-NEXT: 0x30470 R_AARCH64_AUTH_RELATIVE - 0x1 +# UNPACKED-NEXT: 0x30478 R_AARCH64_AUTH_RELATIVE - 0x30472 +# UNPACKED-NEXT: 0x30480 R_AARCH64_AUTH_RELATIVE - 0xFFFFFFFFFFFFFFFD +# UNPACKED-NEXT: 0x30488 R_AARCH64_AUTH_RELATIVE - 0x12345678 +# UNPACKED-NEXT: 0x30490 R_AARCH64_AUTH_RELATIVE - 0x123456789A +# UNPACKED-NEXT: 0x30498 R_AARCH64_AUTH_RELATIVE - 0xFFFFFFEDCBA98766 +# UNPACKED-NEXT: 0x304A0 R_AARCH64_AUTH_RELATIVE - 0x8003046F +# UNPACKED-NEXT: 0x304B9 R_AARCH64_AUTH_RELATIVE - 0x4 
+# UNPACKED-NEXT: 0x304C2 R_AARCH64_AUTH_RELATIVE - 0x30475 +# UNPACKED-NEXT: 0x304A8 R_AARCH64_AUTH_ABS64 zed2 0x1111 +# UNPACKED-NEXT: 0x304B0 R_AARCH64_AUTH_ABS64 bar2 0x0 +# UNPACKED-NEXT: } + +# RUN: ld.lld %t.o %t.a.so -o %t.nopie +# RUN: llvm-readobj -r %t.nopie | FileCheck --check-prefix=NOPIE %s + +# NOPIE: Section ({{.+}}) .rela.dyn { +# NOPIE: 0x230460 R_AARCH64_AUTH_RELATIVE - 0x200001 +# NOPIE-NEXT: 0x230468 R_AARCH64_AUTH_RELATIVE - 0x230462 +# NOPIE-NEXT: 0x230470 R_AARCH64_AUTH_RELATIVE - 0x1FFFFD +# NOPIE-NEXT: 0x230478 R_AARCH64_AUTH_RELATIVE - 0x12545678 +# NOPIE-NEXT: 0x230480 R_AARCH64_AUTH_RELATIVE - 0x123476789A +# NOPIE-NEXT: 0x230488 R_AARCH64_AUTH_RELATIVE - 0xFFFFFFEDCBC98766 +# NOPIE-NEXT: 0x230490 R_AARCH64_AUTH_RELATIVE - 0x8023045F +# NOPIE-NEXT: 0x2304A9 R_AARCH64_AUTH_RELATIVE - 0x200004 +# NOPIE-NEXT: 0x2304B2 R_AARCH64_AUTH_RELATIVE - 0x230465 +# NOPIE-NEXT: 0x230498 R_AARCH64_AUTH_ABS64 zed2 0x1111 +# NOPIE-NEXT: 0x2304A0 R_AARCH64_AUTH_ABS64 bar2 0x0 +# NOPIE-NEXT: } + +# RUN: ld.lld -pie %t.o %t.a.so -o %t.pie +# RUN: llvm-readelf -S -d -r -x .test %t.pie | FileCheck --check-prefixes=PIE,HEX %s + +# PIE: Section Headers: +# PIE-NEXT: Name Type Address Off Size ES Flg Lk Inf Al +# PIE: .rela.dyn RELA {{0*}}[[#%x,ADDR1:]] +# PIE-SAME: {{0*}}[[#ADDR1]] 000108 18 A 1 0 8 + +# PIE: Relocation section '.rela.dyn' at offset 0x[[#ADDR1]] contains 11 entries: +# PIE-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend +# PIE-NEXT: 0000000000030470 0000000000000411 R_AARCH64_AUTH_RELATIVE 1 +# PIE-NEXT: 0000000000030478 0000000000000411 R_AARCH64_AUTH_RELATIVE 30472 +# PIE-NEXT: 0000000000030480 0000000000000411 R_AARCH64_AUTH_RELATIVE fffffffffffffffd +# PIE-NEXT: 0000000000030488 0000000000000411 R_AARCH64_AUTH_RELATIVE 12345678 +# PIE-NEXT: 0000000000030490 0000000000000411 R_AARCH64_AUTH_RELATIVE 123456789a +# PIE-NEXT: 0000000000030498 0000000000000411 R_AARCH64_AUTH_RELATIVE ffffffedcba98766 +# PIE-NEXT: 00000000000304a0 
0000000000000411 R_AARCH64_AUTH_RELATIVE 8003046f +# PIE-NEXT: 00000000000304b9 0000000000000411 R_AARCH64_AUTH_RELATIVE 4 +# PIE-NEXT: 00000000000304c2 0000000000000411 R_AARCH64_AUTH_RELATIVE 30475 +# PIE-NEXT: 00000000000304a8 0000000100000244 R_AARCH64_AUTH_ABS64 0000000000000000 zed2 + 1111 +# PIE-NEXT: 00000000000304b0 0000000200000244 R_AARCH64_AUTH_ABS64 0000000000000000 bar2 + 0 + +# HEX: Hex dump of section '.test': +# HEX-NEXT: 0x00030470 00000000 2a000020 00000000 2b000000 +## ^^^^ Discr = 42 +## ^^ Key (bits 5..6) = DA +## ^^^^ Discr = 43 +## ^^ Key (bits 5..6) = IA +# HEX-NEXT: 0x00030480 00000000 2c000080 00000000 2d000020 +## ^^^^ Discr = 44 +## ^^ Key (bits 5..6) = IA +## ^^ Addr diversity (bit 7) = true +## ^^^^ Discr = 45 +## ^^ Key (bits 5..6) = DA +# HEX-NEXT: 0x00030490 00000000 2e000020 00000000 2f000020 +## ^^^^ Discr = 46 +## ^^ Key (bits 5..6) = DA +## ^^^^ Discr = 47 +## ^^ Key (bits 5..6) = DA +# HEX-NEXT: 0x000304a0 00000000 30000020 00000000 31000020 +## ^^^^ Discr = 48 +## ^^ Key (bits 5..6) = DA +## ^^^^ Discr = 49 +## ^^ Key (bits 5..6) = DA +# HEX-NEXT: 0x000304b0 00000000 32000000 77000000 00330000 +## ^^^^ Discr = 50 +## ^^ Key (bits 5..6) = IA +## ^^^^ Discr = 51 +# HEX-NEXT: 0x000304c0 20770000 00003400 0020{{\ }} +## ^^ Key (bits 5..6) = DA +## ^^^^ Discr = 52 +## ^^ Key (bits 5..6) = DA + +.section .test, "aw" +.p2align 3 +.quad (__ehdr_start + 1)@AUTH(da,42) +.quad (.test + 2)@AUTH(ia,43) +.quad (__ehdr_start - 3)@AUTH(ia,44,addr) +.quad (__ehdr_start + 0x12345678)@AUTH(da,45) +.quad (__ehdr_start + 0x123456789A)@AUTH(da,46) +.quad (__ehdr_start - 0x123456789A)@AUTH(da,47) +.quad (.test + 0x7FFFFFFF)@AUTH(da,48) +.quad (zed2 + 0x1111)@AUTH(da,49) +.quad bar2@AUTH(ia,50) +.byte 0x77 +.quad (__ehdr_start + 4)@AUTH(da,51) +.byte 0x77 +.quad (.test + 5)@AUTH(da,52) -- cgit v1.2.3 From 71c3f5d617aa132418e87403c8be3cdcd102ab18 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Thu, 4 Apr 2024 11:41:27 +0200 Subject: 
[reland][libc] Refactor `BigInt` (#87613) This is a reland of #86137 with a fix for platforms / compiler that do not support trivially constructible int128 types. --- libc/fuzzing/CMakeLists.txt | 1 + libc/fuzzing/__support/CMakeLists.txt | 7 + libc/fuzzing/__support/uint_fuzz.cpp | 70 ++ libc/src/__support/FPUtil/dyadic_float.h | 6 +- libc/src/__support/UInt.h | 1129 ++++++++++---------- libc/src/__support/float_to_string.h | 7 +- libc/src/__support/integer_literals.h | 25 +- libc/src/__support/math_extras.h | 249 ++--- libc/src/__support/number_pair.h | 11 - libc/test/src/__support/integer_literals_test.cpp | 21 + libc/test/src/__support/math_extras_test.cpp | 57 + libc/test/src/__support/uint_test.cpp | 192 +++- .../libc/test/src/__support/BUILD.bazel | 1 + 13 files changed, 1014 insertions(+), 762 deletions(-) create mode 100644 libc/fuzzing/__support/CMakeLists.txt create mode 100644 libc/fuzzing/__support/uint_fuzz.cpp diff --git a/libc/fuzzing/CMakeLists.txt b/libc/fuzzing/CMakeLists.txt index 82487688af11..816691b4bd44 100644 --- a/libc/fuzzing/CMakeLists.txt +++ b/libc/fuzzing/CMakeLists.txt @@ -1,6 +1,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=fuzzer") add_custom_target(libc-fuzzer) +add_subdirectory(__support) # TODO(#85680): Re-enable math fuzzing after headers are sorted out # add_subdirectory(math) add_subdirectory(stdlib) diff --git a/libc/fuzzing/__support/CMakeLists.txt b/libc/fuzzing/__support/CMakeLists.txt new file mode 100644 index 000000000000..278e914e3fbe --- /dev/null +++ b/libc/fuzzing/__support/CMakeLists.txt @@ -0,0 +1,7 @@ +add_libc_fuzzer( + uint_fuzz + SRCS + uint_fuzz.cpp + DEPENDS + libc.src.__support.uint +) diff --git a/libc/fuzzing/__support/uint_fuzz.cpp b/libc/fuzzing/__support/uint_fuzz.cpp new file mode 100644 index 000000000000..f48f00d3b4ba --- /dev/null +++ b/libc/fuzzing/__support/uint_fuzz.cpp @@ -0,0 +1,70 @@ +#include "src/__support/CPP/bit.h" +#include "src/__support/UInt.h" +#include 
"src/string/memory_utils/inline_memcpy.h" + +using namespace LIBC_NAMESPACE; + +// Helper function when using gdb / lldb to set a breakpoint and inspect values. +template void debug_and_trap(const char *msg, T a, T b) { + __builtin_trap(); +} + +#define DEBUG_AND_TRAP() + +#define TEST_BINOP(OP) \ + if ((a OP b) != (static_cast(BigInt(a) OP BigInt(b)))) \ + debug_and_trap(#OP, a, b); + +#define TEST_SHIFTOP(OP) \ + if ((a OP b) != (static_cast(BigInt(a) OP b))) \ + debug_and_trap(#OP, a, b); + +#define TEST_FUNCTION(FUN) \ + if (FUN(a) != FUN(BigInt(a))) \ + debug_and_trap(#FUN, a, b); + +// Test that basic arithmetic operations of BigInt behave like their scalar +// counterparts. +template void run_tests(T a, T b) { + TEST_BINOP(+) + TEST_BINOP(-) + TEST_BINOP(*) + if (b != 0) + TEST_BINOP(/) + if (b >= 0 && b < cpp::numeric_limits::digits) { + TEST_SHIFTOP(<<) + TEST_SHIFTOP(>>) + } + if constexpr (!BigInt::SIGNED) { + TEST_FUNCTION(cpp::has_single_bit) + TEST_FUNCTION(cpp::countr_zero) + TEST_FUNCTION(cpp::countl_zero) + TEST_FUNCTION(cpp::countl_one) + TEST_FUNCTION(cpp::countr_one) + } +} + +// Reads a T from libfuzzer data. +template T read(const uint8_t *data, size_t &remainder) { + T out = 0; + constexpr size_t T_SIZE = sizeof(T); + const size_t copy_size = remainder < T_SIZE ? 
remainder : T_SIZE; + inline_memcpy(&out, data, copy_size); + remainder -= copy_size; + return out; +} + +template +void run_tests(const uint8_t *data, size_t size) { + const auto a = read(data, size); + const auto b = read(data, size); + run_tests(a, b); +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + // unsigned + run_tests>(data, size); + // signed + run_tests>(data, size); + return 0; +} diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h index 73fd7381c3c8..e0c205f52383 100644 --- a/libc/src/__support/FPUtil/dyadic_float.h +++ b/libc/src/__support/FPUtil/dyadic_float.h @@ -58,9 +58,9 @@ template struct DyadicFloat { // significant bit. LIBC_INLINE constexpr DyadicFloat &normalize() { if (!mantissa.is_zero()) { - int shift_length = static_cast(mantissa.clz()); + int shift_length = cpp::countl_zero(mantissa); exponent -= shift_length; - mantissa.shift_left(static_cast(shift_length)); + mantissa <<= static_cast(shift_length); } return *this; } @@ -233,7 +233,7 @@ LIBC_INLINE constexpr DyadicFloat quick_add(DyadicFloat a, result.sign = a.sign; result.exponent = a.exponent; result.mantissa = a.mantissa; - if (result.mantissa.add(b.mantissa)) { + if (result.mantissa.add_overflow(b.mantissa)) { // Mantissa addition overflow. 
result.shift_right(1); result.mantissa.val[DyadicFloat::MantissaType::WORD_COUNT - 1] |= diff --git a/libc/src/__support/UInt.h b/libc/src/__support/UInt.h index 282efdba1c5f..c1e55ceef211 100644 --- a/libc/src/__support/UInt.h +++ b/libc/src/__support/UInt.h @@ -14,10 +14,11 @@ #include "src/__support/CPP/limits.h" #include "src/__support/CPP/optional.h" #include "src/__support/CPP/type_traits.h" -#include "src/__support/macros/attributes.h" // LIBC_INLINE -#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/macros/attributes.h" // LIBC_INLINE +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/macros/properties/compiler.h" // LIBC_COMPILER_IS_CLANG #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128, LIBC_TYPES_HAS_INT64 -#include "src/__support/math_extras.h" // SumCarry, DiffBorrow +#include "src/__support/math_extras.h" // add_with_carry, sub_with_borrow #include "src/__support/number_pair.h" #include // For size_t @@ -25,71 +26,324 @@ namespace LIBC_NAMESPACE { -namespace internal { -template struct half_width; +namespace multiword { -template <> struct half_width : cpp::type_identity {}; -template <> struct half_width : cpp::type_identity {}; +// A type trait mapping unsigned integers to their half-width unsigned +// counterparts. +template struct half_width; template <> struct half_width : cpp::type_identity {}; +template <> struct half_width : cpp::type_identity {}; +#ifdef LIBC_TYPES_HAS_INT64 +template <> struct half_width : cpp::type_identity {}; #ifdef LIBC_TYPES_HAS_INT128 template <> struct half_width<__uint128_t> : cpp::type_identity {}; #endif // LIBC_TYPES_HAS_INT128 - +#endif // LIBC_TYPES_HAS_INT64 template using half_width_t = typename half_width::type; -template constexpr NumberPair full_mul(T a, T b) { - NumberPair pa = split(a); - NumberPair pb = split(b); - NumberPair prod; +// An array of two elements that can be used in multiword operations. 
+template struct DoubleWide final : cpp::array { + using UP = cpp::array; + using UP::UP; + LIBC_INLINE constexpr DoubleWide(T lo, T hi) : UP({lo, hi}) {} +}; + +// Converts an unsigned value into a DoubleWide>. +template LIBC_INLINE constexpr auto split(T value) { + static_assert(cpp::is_unsigned_v); + using half_type = half_width_t; + return DoubleWide( + half_type(value), + half_type(value >> cpp::numeric_limits::digits)); +} + +// The low part of a DoubleWide value. +template LIBC_INLINE constexpr T lo(const DoubleWide &value) { + return value[0]; +} +// The high part of a DoubleWide value. +template LIBC_INLINE constexpr T hi(const DoubleWide &value) { + return value[1]; +} +// The low part of an unsigned value. +template LIBC_INLINE constexpr half_width_t lo(T value) { + return lo(split(value)); +} +// The high part of an unsigned value. +template LIBC_INLINE constexpr half_width_t hi(T value) { + return hi(split(value)); +} + +// Returns 'a' times 'b' in a DoubleWide. Cannot overflow by construction. +template +LIBC_INLINE constexpr DoubleWide mul2(word a, word b) { + if constexpr (cpp::is_same_v) { + return split(uint16_t(a) * uint16_t(b)); + } else if constexpr (cpp::is_same_v) { + return split(uint32_t(a) * uint32_t(b)); + } +#ifdef LIBC_TYPES_HAS_INT64 + else if constexpr (cpp::is_same_v) { + return split(uint64_t(a) * uint64_t(b)); + } +#endif +#ifdef LIBC_TYPES_HAS_INT128 + else if constexpr (cpp::is_same_v) { + return split<__uint128_t>(__uint128_t(a) * __uint128_t(b)); + } +#endif + else { + using half_word = half_width_t; + const auto shiftl = [](word value) -> word { + return value << cpp::numeric_limits::digits; + }; + const auto shiftr = [](word value) -> word { + return value >> cpp::numeric_limits::digits; + }; + // Here we do a one digit multiplication where 'a' and 'b' are of type + // word. We split 'a' and 'b' into half words and perform the classic long + // multiplication with 'a' and 'b' being two-digit numbers. 
+ + // a a_hi a_lo + // x b => x b_hi b_lo + // ---- ----------- + // c result + // We convert 'lo' and 'hi' from 'half_word' to 'word' so multiplication + // doesn't overflow. + const word a_lo = lo(a); + const word b_lo = lo(b); + const word a_hi = hi(a); + const word b_hi = hi(b); + const word step1 = b_lo * a_lo; // no overflow; + const word step2 = b_lo * a_hi; // no overflow; + const word step3 = b_hi * a_lo; // no overflow; + const word step4 = b_hi * a_hi; // no overflow; + word lo_digit = step1; + word hi_digit = step4; + const word no_carry = 0; + word carry; + word _; // unused carry variable. + lo_digit = add_with_carry(lo_digit, shiftl(step2), no_carry, carry); + hi_digit = add_with_carry(hi_digit, shiftr(step2), carry, _); + lo_digit = add_with_carry(lo_digit, shiftl(step3), no_carry, carry); + hi_digit = add_with_carry(hi_digit, shiftr(step3), carry, _); + return DoubleWide(lo_digit, hi_digit); + } +} + +// In-place 'dst op= rhs' with operation with carry propagation. Returns carry. +template +LIBC_INLINE constexpr word inplace_binop(Function op_with_carry, + cpp::array &dst, + const cpp::array &rhs) { + static_assert(N >= M); + word carry_out = 0; + for (size_t i = 0; i < N; ++i) { + const bool has_rhs_value = i < M; + const word rhs_value = has_rhs_value ? rhs[i] : 0; + const word carry_in = carry_out; + dst[i] = op_with_carry(dst[i], rhs_value, carry_in, carry_out); + // stop early when rhs is over and no carry is to be propagated. + if (!has_rhs_value && carry_out == 0) + break; + } + return carry_out; +} - prod.lo = pa.lo * pb.lo; // exact - prod.hi = pa.hi * pb.hi; // exact - NumberPair lo_hi = split(pa.lo * pb.hi); // exact - NumberPair hi_lo = split(pa.hi * pb.lo); // exact +// In-place addition. Returns carry. +template +LIBC_INLINE constexpr word add_with_carry(cpp::array &dst, + const cpp::array &rhs) { + return inplace_binop(LIBC_NAMESPACE::add_with_carry, dst, rhs); +} + +// In-place subtraction. Returns borrow. 
+template +LIBC_INLINE constexpr word sub_with_borrow(cpp::array &dst, + const cpp::array &rhs) { + return inplace_binop(LIBC_NAMESPACE::sub_with_borrow, dst, rhs); +} + +// In-place multiply-add. Returns carry. +// i.e., 'dst += b * c' +template +LIBC_INLINE constexpr word mul_add_with_carry(cpp::array &dst, word b, + word c) { + return add_with_carry(dst, mul2(b, c)); +} - constexpr size_t HALF_BIT_WIDTH = sizeof(T) * CHAR_BIT / 2; +// An array of two elements serving as an accumulator during multiword +// computations. +template struct Accumulator final : cpp::array { + using UP = cpp::array; + LIBC_INLINE constexpr Accumulator() : UP({0, 0}) {} + LIBC_INLINE constexpr T advance(T carry_in) { + auto result = UP::front(); + UP::front() = UP::back(); + UP::back() = carry_in; + return result; + } + LIBC_INLINE constexpr T sum() const { return UP::front(); } + LIBC_INLINE constexpr T carry() const { return UP::back(); } +}; - auto r1 = add_with_carry(prod.lo, lo_hi.lo << HALF_BIT_WIDTH, T(0)); - prod.lo = r1.sum; - prod.hi = add_with_carry(prod.hi, lo_hi.hi, r1.carry).sum; +// In-place multiplication by a single word. Returns carry. +template +LIBC_INLINE constexpr word scalar_multiply_with_carry(cpp::array &dst, + word x) { + Accumulator acc; + for (auto &val : dst) { + const word carry = mul_add_with_carry(acc, val, x); + val = acc.advance(carry); + } + return acc.carry(); +} - auto r2 = add_with_carry(prod.lo, hi_lo.lo << HALF_BIT_WIDTH, T(0)); - prod.lo = r2.sum; - prod.hi = add_with_carry(prod.hi, hi_lo.hi, r2.carry).sum; +// Multiplication of 'lhs' by 'rhs' into 'dst'. Returns carry. +// This function is safe to use for signed numbers. 
+// https://stackoverflow.com/a/20793834 +// https://pages.cs.wisc.edu/%7Emarkhill/cs354/Fall2008/beyond354/int.mult.html +template +LIBC_INLINE constexpr word multiply_with_carry(cpp::array &dst, + const cpp::array &lhs, + const cpp::array &rhs) { + static_assert(O >= M + N); + Accumulator acc; + for (size_t i = 0; i < O; ++i) { + const size_t lower_idx = i < N ? 0 : i - N + 1; + const size_t upper_idx = i < M ? i : M - 1; + word carry = 0; + for (size_t j = lower_idx; j <= upper_idx; ++j) + carry += mul_add_with_carry(acc, lhs[j], rhs[i - j]); + dst[i] = acc.advance(carry); + } + return acc.carry(); +} - return prod; +template +LIBC_INLINE constexpr void quick_mul_hi(cpp::array &dst, + const cpp::array &lhs, + const cpp::array &rhs) { + Accumulator acc; + word carry = 0; + // First round of accumulation for those at N - 1 in the full product. + for (size_t i = 0; i < N; ++i) + carry += mul_add_with_carry(acc, lhs[i], rhs[N - 1 - i]); + for (size_t i = N; i < 2 * N - 1; ++i) { + acc.advance(carry); + carry = 0; + for (size_t j = i - N + 1; j < N; ++j) + carry += mul_add_with_carry(acc, lhs[j], rhs[i - j]); + dst[i - N] = acc.sum(); + } + dst.back() = acc.carry(); } -template <> -LIBC_INLINE constexpr NumberPair full_mul(uint32_t a, - uint32_t b) { - uint64_t prod = uint64_t(a) * uint64_t(b); - NumberPair result; - result.lo = uint32_t(prod); - result.hi = uint32_t(prod >> 32); - return result; +template +LIBC_INLINE constexpr bool is_negative(cpp::array &array) { + using signed_word = cpp::make_signed_t; + return cpp::bit_cast(array.back()) < 0; } +// An enum for the shift function below. +enum Direction { LEFT, RIGHT }; + +// A bitwise shift on an array of elements. +// TODO: Make the result UB when 'offset' is greater or equal to the number of +// bits in 'array'. This will allow for better code performance. 
+template +LIBC_INLINE constexpr cpp::array shift(cpp::array array, + size_t offset) { + static_assert(direction == LEFT || direction == RIGHT); + constexpr size_t WORD_BITS = cpp::numeric_limits::digits; + constexpr size_t TOTAL_BITS = N * WORD_BITS; + if (LIBC_UNLIKELY(offset == 0)) + return array; + if (LIBC_UNLIKELY(offset >= TOTAL_BITS)) + return {}; #ifdef LIBC_TYPES_HAS_INT128 -template <> -LIBC_INLINE constexpr NumberPair full_mul(uint64_t a, - uint64_t b) { - __uint128_t prod = __uint128_t(a) * __uint128_t(b); - NumberPair result; - result.lo = uint64_t(prod); - result.hi = uint64_t(prod >> 64); - return result; + if constexpr (TOTAL_BITS == 128) { + using type = cpp::conditional_t; + auto tmp = cpp::bit_cast(array); + if constexpr (direction == LEFT) + tmp <<= offset; + else + tmp >>= offset; + return cpp::bit_cast>(tmp); + } +#endif + const bool is_neg = is_signed && is_negative(array); + constexpr auto at = [](size_t index) -> int { + // reverse iteration when direction == LEFT. + if constexpr (direction == LEFT) + return int(N) - int(index) - 1; + return int(index); + }; + const auto safe_get_at = [&](size_t index) -> word { + // return appropriate value when accessing out of bound elements. + const int i = at(index); + if (i < 0) + return 0; + if (i >= int(N)) + return is_neg ? -1 : 0; + return array[i]; + }; + const size_t index_offset = offset / WORD_BITS; + const size_t bit_offset = offset % WORD_BITS; +#ifdef LIBC_COMPILER_IS_CLANG + __builtin_assume(index_offset < N); +#endif + cpp::array out = {}; + for (size_t index = 0; index < N; ++index) { + const word part1 = safe_get_at(index + index_offset); + const word part2 = safe_get_at(index + index_offset + 1); + word &dst = out[at(index)]; + if (bit_offset == 0) + dst = part1; // no crosstalk between parts. 
+ else if constexpr (direction == LEFT) + dst = (part1 << bit_offset) | (part2 >> (WORD_BITS - bit_offset)); + else + dst = (part1 >> bit_offset) | (part2 << (WORD_BITS - bit_offset)); + } + return out; } -#endif // LIBC_TYPES_HAS_INT128 -} // namespace internal +#define DECLARE_COUNTBIT(NAME, INDEX_EXPR) \ + template \ + LIBC_INLINE constexpr int NAME(const cpp::array &val) { \ + int bit_count = 0; \ + for (size_t i = 0; i < N; ++i) { \ + const int word_count = cpp::NAME(val[INDEX_EXPR]); \ + bit_count += word_count; \ + if (word_count != cpp::numeric_limits::digits) \ + break; \ + } \ + return bit_count; \ + } + +DECLARE_COUNTBIT(countr_zero, i) // iterating forward +DECLARE_COUNTBIT(countr_one, i) // iterating forward +DECLARE_COUNTBIT(countl_zero, N - i - 1) // iterating backward +DECLARE_COUNTBIT(countl_one, N - i - 1) // iterating backward + +} // namespace multiword template struct BigInt { +private: static_assert(cpp::is_integral_v && cpp::is_unsigned_v, "WordType must be unsigned integer."); + struct Division { + BigInt quotient; + BigInt remainder; + }; + +public: using word_type = WordType; + using unsigned_type = BigInt; + using signed_type = BigInt; + LIBC_INLINE_VAR static constexpr bool SIGNED = Signed; LIBC_INLINE_VAR static constexpr size_t BITS = Bits; LIBC_INLINE_VAR @@ -100,10 +354,7 @@ struct BigInt { LIBC_INLINE_VAR static constexpr size_t WORD_COUNT = Bits / WORD_SIZE; - using unsigned_type = BigInt; - using signed_type = BigInt; - - cpp::array val{}; + cpp::array val{}; // zero initialized. 
LIBC_INLINE constexpr BigInt() = default; @@ -112,76 +363,67 @@ struct BigInt { template LIBC_INLINE constexpr BigInt( const BigInt &other) { - if (OtherBits >= Bits) { + if (OtherBits >= Bits) { // truncate for (size_t i = 0; i < WORD_COUNT; ++i) val[i] = other[i]; - } else { + } else { // zero or sign extend size_t i = 0; for (; i < OtherBits / WORD_SIZE; ++i) val[i] = other[i]; - WordType sign = 0; - if constexpr (Signed && OtherSigned) { - sign = static_cast( - -static_cast>(other.is_neg())); - } - for (; i < WORD_COUNT; ++i) - val[i] = sign; + extend(i, Signed && other.is_neg()); } } // Construct a BigInt from a C array. - template = 0> - LIBC_INLINE constexpr BigInt(const WordType (&nums)[N]) { - size_t min_wordcount = N < WORD_COUNT ? N : WORD_COUNT; - size_t i = 0; - for (; i < min_wordcount; ++i) + template LIBC_INLINE constexpr BigInt(const WordType (&nums)[N]) { + static_assert(N == WORD_COUNT); + for (size_t i = 0; i < WORD_COUNT; ++i) val[i] = nums[i]; + } - // If nums doesn't completely fill val, then fill the rest with zeroes. - for (; i < WORD_COUNT; ++i) - val[i] = 0; + LIBC_INLINE constexpr explicit BigInt( + const cpp::array &words) { + val = words; } // Initialize the first word to |v| and the rest to 0. template >> LIBC_INLINE constexpr BigInt(T v) { - val[0] = static_cast(v); - - if constexpr (WORD_COUNT == 1) - return; - - if constexpr (Bits < sizeof(T) * CHAR_BIT) { - for (int i = 1; i < WORD_COUNT; ++i) { - v >>= WORD_SIZE; - val[i] = static_cast(v); + constexpr size_t T_SIZE = sizeof(T) * CHAR_BIT; + const bool is_neg = Signed && (v < 0); + for (size_t i = 0; i < WORD_COUNT; ++i) { + if (v == 0) { + extend(i, is_neg); + return; } - return; - } - - size_t i = 1; - - if constexpr (WORD_SIZE < sizeof(T) * CHAR_BIT) - for (; i < sizeof(T) * CHAR_BIT / WORD_SIZE; ++i) { + val[i] = static_cast(v); + if constexpr (T_SIZE > WORD_SIZE) v >>= WORD_SIZE; - val[i] = static_cast(v); - } - - WordType sign = (Signed && (v < 0)) ? 
~WordType(0) : WordType(0); - for (; i < WORD_COUNT; ++i) { - val[i] = sign; + else + v = 0; } } + LIBC_INLINE constexpr BigInt &operator=(const BigInt &other) = default; - LIBC_INLINE constexpr explicit BigInt( - const cpp::array &words) { - for (size_t i = 0; i < WORD_COUNT; ++i) - val[i] = words[i]; + // constants + LIBC_INLINE static constexpr BigInt zero() { return BigInt(); } + LIBC_INLINE static constexpr BigInt one() { return BigInt(1); } + LIBC_INLINE static constexpr BigInt all_ones() { return ~zero(); } + LIBC_INLINE static constexpr BigInt min() { + BigInt out; + if constexpr (SIGNED) + out.set_msb(); + return out; + } + LIBC_INLINE static constexpr BigInt max() { + BigInt out = all_ones(); + if constexpr (SIGNED) + out.clear_msb(); + return out; } // TODO: Reuse the Sign type. - LIBC_INLINE constexpr bool is_neg() const { - return val.back() >> (WORD_SIZE - 1); - } + LIBC_INLINE constexpr bool is_neg() const { return SIGNED && get_msb(); } template LIBC_INLINE constexpr explicit operator T() const { return to(); @@ -191,200 +433,100 @@ struct BigInt { LIBC_INLINE constexpr cpp::enable_if_t< cpp::is_integral_v && !cpp::is_same_v, T> to() const { + constexpr size_t T_SIZE = sizeof(T) * CHAR_BIT; T lo = static_cast(val[0]); - - constexpr size_t T_BITS = sizeof(T) * CHAR_BIT; - - if constexpr (T_BITS <= WORD_SIZE) + if constexpr (T_SIZE <= WORD_SIZE) return lo; - constexpr size_t MAX_COUNT = - T_BITS > Bits ? WORD_COUNT : T_BITS / WORD_SIZE; + T_SIZE > Bits ? WORD_COUNT : T_SIZE / WORD_SIZE; for (size_t i = 1; i < MAX_COUNT; ++i) lo += static_cast(val[i]) << (WORD_SIZE * i); - - if constexpr (Signed && (T_BITS > Bits)) { + if constexpr (Signed && (T_SIZE > Bits)) { // Extend sign for negative numbers. 
constexpr T MASK = (~T(0) << Bits); if (is_neg()) lo |= MASK; } - return lo; } LIBC_INLINE constexpr explicit operator bool() const { return !is_zero(); } - LIBC_INLINE constexpr BigInt &operator=(const BigInt &other) = default; - LIBC_INLINE constexpr bool is_zero() const { - for (size_t i = 0; i < WORD_COUNT; ++i) { - if (val[i] != 0) + for (auto part : val) + if (part != 0) return false; - } return true; } - // Add x to this number and store the result in this number. + // Add 'rhs' to this number and store the result in this number. // Returns the carry value produced by the addition operation. - LIBC_INLINE constexpr WordType add(const BigInt &x) { - SumCarry s{0, 0}; - for (size_t i = 0; i < WORD_COUNT; ++i) { - s = add_with_carry(val[i], x.val[i], s.carry); - val[i] = s.sum; - } - return s.carry; + LIBC_INLINE constexpr WordType add_overflow(const BigInt &rhs) { + return multiword::add_with_carry(val, rhs.val); } LIBC_INLINE constexpr BigInt operator+(const BigInt &other) const { - BigInt result; - SumCarry s{0, 0}; - for (size_t i = 0; i < WORD_COUNT; ++i) { - s = add_with_carry(val[i], other.val[i], s.carry); - result.val[i] = s.sum; - } + BigInt result = *this; + result.add_overflow(other); return result; } // This will only apply when initializing a variable from constant values, so // it will always use the constexpr version of add_with_carry. LIBC_INLINE constexpr BigInt operator+(BigInt &&other) const { - BigInt result; - SumCarry s{0, 0}; - for (size_t i = 0; i < WORD_COUNT; ++i) { - s = add_with_carry(val[i], other.val[i], s.carry); - result.val[i] = s.sum; - } - return result; + // We use addition commutativity to reuse 'other' and prevent allocation. + other.add_overflow(*this); // Returned carry value is ignored. + return other; } LIBC_INLINE constexpr BigInt &operator+=(const BigInt &other) { - add(other); // Returned carry value is ignored. + add_overflow(other); // Returned carry value is ignored. 
return *this; } - // Subtract x to this number and store the result in this number. + // Subtract 'rhs' to this number and store the result in this number. // Returns the carry value produced by the subtraction operation. - LIBC_INLINE constexpr WordType sub(const BigInt &x) { - DiffBorrow d{0, 0}; - for (size_t i = 0; i < WORD_COUNT; ++i) { - d = sub_with_borrow(val[i], x.val[i], d.borrow); - val[i] = d.diff; - } - return d.borrow; + LIBC_INLINE constexpr WordType sub_overflow(const BigInt &rhs) { + return multiword::sub_with_borrow(val, rhs.val); } LIBC_INLINE constexpr BigInt operator-(const BigInt &other) const { - BigInt result; - DiffBorrow d{0, 0}; - for (size_t i = 0; i < WORD_COUNT; ++i) { - d = sub_with_borrow(val[i], other.val[i], d.borrow); - result.val[i] = d.diff; - } + BigInt result = *this; + result.sub_overflow(other); // Returned carry value is ignored. return result; } LIBC_INLINE constexpr BigInt operator-(BigInt &&other) const { - BigInt result; - DiffBorrow d{0, 0}; - for (size_t i = 0; i < WORD_COUNT; ++i) { - d = sub_with_borrow(val[i], other.val[i], d.borrow); - result.val[i] = d.diff; - } + BigInt result = *this; + result.sub_overflow(other); // Returned carry value is ignored. return result; } LIBC_INLINE constexpr BigInt &operator-=(const BigInt &other) { // TODO(lntue): Set overflow flag / errno when carry is true. - sub(other); + sub_overflow(other); // Returned carry value is ignored. return *this; } - // Multiply this number with x and store the result in this number. It is - // implemented using the long multiplication algorithm by splitting the - // 64-bit words of this number and |x| in to 32-bit halves but peforming - // the operations using 64-bit numbers. This ensures that we don't lose the - // carry bits. - // Returns the carry value produced by the multiplication operation. + // Multiply this number with x and store the result in this number. 
LIBC_INLINE constexpr WordType mul(WordType x) { - BigInt<2 * WORD_SIZE, Signed, WordType> partial_sum(0); - for (size_t i = 0; i < WORD_COUNT; ++i) { - NumberPair prod = internal::full_mul(val[i], x); - BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi}); - const WordType carry = partial_sum.add(tmp); - val[i] = partial_sum.val[0]; - partial_sum.val[0] = partial_sum.val[1]; - partial_sum.val[1] = carry; - } - return partial_sum.val[1]; - } - - LIBC_INLINE constexpr BigInt operator*(const BigInt &other) const { - if constexpr (Signed) { - BigInt a(*this); - BigInt b(other); - const bool a_neg = a.is_neg(); - const bool b_neg = b.is_neg(); - if (a_neg) - a = -a; - if (b_neg) - b = -b; - BigInt prod = a * b; - if (a_neg != b_neg) - prod = -prod; - return static_cast>(prod); - } else { - if constexpr (WORD_COUNT == 1) { - return {val[0] * other.val[0]}; - } else { - BigInt result(0); - BigInt<2 * WORD_SIZE, Signed, WordType> partial_sum(0); - WordType carry = 0; - for (size_t i = 0; i < WORD_COUNT; ++i) { - for (size_t j = 0; j <= i; j++) { - NumberPair prod = - internal::full_mul(val[j], other.val[i - j]); - BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi}); - carry += partial_sum.add(tmp); - } - result.val[i] = partial_sum.val[0]; - partial_sum.val[0] = partial_sum.val[1]; - partial_sum.val[1] = carry; - carry = 0; - } - return result; - } - } + return multiword::scalar_multiply_with_carry(val, x); } - // Return the full product, only unsigned for now. + // Return the full product. template - LIBC_INLINE constexpr BigInt + LIBC_INLINE constexpr auto ful_mul(const BigInt &other) const { - BigInt result(0); - BigInt<2 * WORD_SIZE, Signed, WordType> partial_sum(0); - WordType carry = 0; - constexpr size_t OTHER_WORDCOUNT = - BigInt::WORD_COUNT; - for (size_t i = 0; i <= WORD_COUNT + OTHER_WORDCOUNT - 2; ++i) { - const size_t lower_idx = - i < OTHER_WORDCOUNT ? 0 : i - OTHER_WORDCOUNT + 1; - const size_t upper_idx = i < WORD_COUNT ? 
i : WORD_COUNT - 1; - for (size_t j = lower_idx; j <= upper_idx; ++j) { - NumberPair prod = - internal::full_mul(val[j], other.val[i - j]); - BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi}); - carry += partial_sum.add(tmp); - } - result.val[i] = partial_sum.val[0]; - partial_sum.val[0] = partial_sum.val[1]; - partial_sum.val[1] = carry; - carry = 0; - } - result.val[WORD_COUNT + OTHER_WORDCOUNT - 1] = partial_sum.val[0]; + BigInt result; + multiword::multiply_with_carry(result.val, val, other.val); return result; } + LIBC_INLINE constexpr BigInt operator*(const BigInt &other) const { + // Perform full mul and truncate. + return BigInt(ful_mul(other)); + } + // Fast hi part of the full product. The normal product `operator*` returns // `Bits` least significant bits of the full product, while this function will // approximate `Bits` most significant bits of the full product with errors @@ -407,39 +549,17 @@ struct BigInt { // 256 4 16 10 3 // 512 8 64 36 7 LIBC_INLINE constexpr BigInt quick_mul_hi(const BigInt &other) const { - BigInt result(0); - BigInt<2 * WORD_SIZE, Signed, WordType> partial_sum(0); - WordType carry = 0; - // First round of accumulation for those at WORD_COUNT - 1 in the full - // product. 
- for (size_t i = 0; i < WORD_COUNT; ++i) { - NumberPair prod = - internal::full_mul(val[i], other.val[WORD_COUNT - 1 - i]); - BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi}); - carry += partial_sum.add(tmp); - } - for (size_t i = WORD_COUNT; i < 2 * WORD_COUNT - 1; ++i) { - partial_sum.val[0] = partial_sum.val[1]; - partial_sum.val[1] = carry; - carry = 0; - for (size_t j = i - WORD_COUNT + 1; j < WORD_COUNT; ++j) { - NumberPair prod = - internal::full_mul(val[j], other.val[i - j]); - BigInt<2 * WORD_SIZE, Signed, WordType> tmp({prod.lo, prod.hi}); - carry += partial_sum.add(tmp); - } - result.val[i - WORD_COUNT] = partial_sum.val[0]; - } - result.val[WORD_COUNT - 1] = partial_sum.val[1]; + BigInt result; + multiword::quick_mul_hi(result.val, val, other.val); return result; } - // pow takes a power and sets this to its starting value to that power. Zero - // to the zeroth power returns 1. + // BigInt(x).pow_n(n) computes x ^ n. + // Note 0 ^ 0 == 1. LIBC_INLINE constexpr void pow_n(uint64_t power) { - BigInt result = 1; + static_assert(!Signed); + BigInt result = one(); BigInt cur_power = *this; - while (power > 0) { if ((power % 2) > 0) result *= cur_power; @@ -449,38 +569,23 @@ struct BigInt { *this = result; } - // TODO: Make division work correctly for signed integers. - - // div takes another BigInt of the same size and divides this by it. The value - // of this will be set to the quotient, and the return value is the remainder. - LIBC_INLINE constexpr cpp::optional div(const BigInt &other) { - BigInt remainder(0); - if (*this < other) { - remainder = *this; - *this = BigInt(0); - return remainder; - } - if (other == 1) { - return remainder; - } - if (other == 0) { + // Performs inplace signed / unsigned division. Returns remainder if not + // dividing by zero. + // For signed numbers it behaves like C++ signed integer division. 
+ // That is by truncating the fractionnal part + // https://stackoverflow.com/a/3602857 + LIBC_INLINE constexpr cpp::optional div(const BigInt ÷r) { + if (LIBC_UNLIKELY(divider.is_zero())) return cpp::nullopt; - } - - BigInt quotient(0); - BigInt subtractor = other; - int cur_bit = static_cast(subtractor.clz() - this->clz()); - subtractor.shift_left(cur_bit); - - for (; cur_bit >= 0 && *this > 0; --cur_bit, subtractor.shift_right(1)) { - if (*this >= subtractor) { - this->sub(subtractor); - quotient = quotient | (BigInt(1) << cur_bit); - } - } - remainder = *this; - *this = quotient; - return remainder; + if (LIBC_UNLIKELY(divider == BigInt::one())) + return BigInt::zero(); + Division result; + if constexpr (SIGNED) + result = divide_signed(*this, divider); + else + result = divide_unsigned(*this, divider); + *this = result.quotient; + return result.remainder; } // Efficiently perform BigInt / (x * 2^e), where x is a half-word-size @@ -496,19 +601,16 @@ struct BigInt { // computation of each step is now properly contained within WordType. // And finally we perform some extra alignment steps for the remaining bits. LIBC_INLINE constexpr cpp::optional - div_uint_half_times_pow_2(internal::half_width_t x, size_t e) { - BigInt remainder(0); - - if (x == 0) { + div_uint_half_times_pow_2(multiword::half_width_t x, size_t e) { + BigInt remainder; + if (x == 0) return cpp::nullopt; - } if (e >= Bits) { remainder = *this; - *this = BigInt(0); + *this = BigInt(); return remainder; } - - BigInt quotient(0); + BigInt quotient; WordType x_word = static_cast(x); constexpr size_t LOG2_WORD_SIZE = cpp::bit_width(WORD_SIZE) - 1; constexpr size_t HALF_WORD_SIZE = WORD_SIZE >> 1; @@ -633,189 +735,22 @@ struct BigInt { return *this; } - // TODO: remove and use cpp::countl_zero below. 
- [[nodiscard]] LIBC_INLINE constexpr int clz() const { - constexpr int word_digits = cpp::numeric_limits::digits; - int leading_zeroes = 0; - for (auto i = val.size(); i > 0;) { - --i; - const int zeroes = cpp::countl_zero(val[i]); - leading_zeroes += zeroes; - if (zeroes != word_digits) - break; - } - return leading_zeroes; - } - - // TODO: remove and use cpp::countr_zero below. - [[nodiscard]] LIBC_INLINE constexpr int ctz() const { - constexpr int word_digits = cpp::numeric_limits::digits; - int trailing_zeroes = 0; - for (auto word : val) { - const int zeroes = cpp::countr_zero(word); - trailing_zeroes += zeroes; - if (zeroes != word_digits) - break; - } - return trailing_zeroes; - } - - LIBC_INLINE constexpr void shift_left(size_t s) { - if constexpr (Bits == WORD_SIZE) { - // Use native types if possible. - if (s >= WORD_SIZE) { - val[0] = 0; - return; - } - val[0] <<= s; - return; - } - if constexpr ((Bits == 64) && (WORD_SIZE == 32)) { - // Use builtin 64 bits for 32-bit base type if available; - if (s >= 64) { - val[0] = 0; - val[1] = 0; - return; - } - uint64_t tmp = uint64__t(val[0]) + (uint64_t(val[1]) << 62); - tmp <<= s; - val[0] = uint32_t(tmp); - val[1] = uint32_t(tmp >> 32); - return; - } -#ifdef LIBC_TYPES_HAS_INT128 - if constexpr ((Bits == 128) && (WORD_SIZE == 64)) { - // Use builtin 128 bits if available; - if (s >= 128) { - val[0] = 0; - val[1] = 0; - return; - } - __uint128_t tmp = __uint128_t(val[0]) + (__uint128_t(val[1]) << 64); - tmp <<= s; - val[0] = uint64_t(tmp); - val[1] = uint64_t(tmp >> 64); - return; - } -#endif // LIBC_TYPES_HAS_INT128 - if (LIBC_UNLIKELY(s == 0)) - return; - - const size_t drop = s / WORD_SIZE; // Number of words to drop - const size_t shift = s % WORD_SIZE; // Bits to shift in the remaining words. 
- size_t i = WORD_COUNT; - - if (drop < WORD_COUNT) { - i = WORD_COUNT - 1; - if (shift > 0) { - for (size_t j = WORD_COUNT - 1 - drop; j > 0; --i, --j) { - val[i] = (val[j] << shift) | (val[j - 1] >> (WORD_SIZE - shift)); - } - val[i] = val[0] << shift; - } else { - for (size_t j = WORD_COUNT - 1 - drop; j > 0; --i, --j) { - val[i] = val[j]; - } - val[i] = val[0]; - } - } - - for (size_t j = 0; j < i; ++j) { - val[j] = 0; - } + LIBC_INLINE constexpr BigInt &operator<<=(size_t s) { + val = multiword::shift(val, s); + return *this; } LIBC_INLINE constexpr BigInt operator<<(size_t s) const { - BigInt result(*this); - result.shift_left(s); - return result; + return BigInt(multiword::shift(val, s)); } - LIBC_INLINE constexpr BigInt &operator<<=(size_t s) { - shift_left(s); + LIBC_INLINE constexpr BigInt &operator>>=(size_t s) { + val = multiword::shift(val, s); return *this; } - LIBC_INLINE constexpr void shift_right(size_t s) { - if constexpr ((Bits == 64) && (WORD_SIZE == 32)) { - // Use builtin 64 bits if available; - if (s >= 64) { - val[0] = 0; - val[1] = 0; - return; - } - uint64_t tmp = uint64_t(val[0]) + (uint64_t(val[1]) << 32); - if constexpr (Signed) { - tmp = static_cast(static_cast(tmp) >> s); - } else { - tmp >>= s; - } - val[0] = uint32_t(tmp); - val[1] = uint32_t(tmp >> 32); - return; - } -#ifdef LIBC_TYPES_HAS_INT128 - if constexpr ((Bits == 128) && (WORD_SIZE == 64)) { - // Use builtin 128 bits if available; - if (s >= 128) { - val[0] = 0; - val[1] = 0; - return; - } - __uint128_t tmp = __uint128_t(val[0]) + (__uint128_t(val[1]) << 64); - if constexpr (Signed) { - tmp = static_cast<__uint128_t>(static_cast<__int128_t>(tmp) >> s); - } else { - tmp >>= s; - } - val[0] = uint64_t(tmp); - val[1] = uint64_t(tmp >> 64); - return; - } -#endif // LIBC_TYPES_HAS_INT128 - - if (LIBC_UNLIKELY(s == 0)) - return; - const size_t drop = s / WORD_SIZE; // Number of words to drop - const size_t shift = s % WORD_SIZE; // Bit shift in the remaining words. 
- - size_t i = 0; - WordType sign = Signed ? is_neg() : 0; - - if (drop < WORD_COUNT) { - if (shift > 0) { - for (size_t j = drop; j < WORD_COUNT - 1; ++i, ++j) { - val[i] = (val[j] >> shift) | (val[j + 1] << (WORD_SIZE - shift)); - } - if constexpr (Signed) { - val[i] = static_cast( - static_cast>(val[WORD_COUNT - 1]) >> - shift); - } else { - val[i] = val[WORD_COUNT - 1] >> shift; - } - ++i; - } else { - for (size_t j = drop; j < WORD_COUNT; ++i, ++j) { - val[i] = val[j]; - } - } - } - - for (; i < WORD_COUNT; ++i) { - val[i] = sign; - } - } - LIBC_INLINE constexpr BigInt operator>>(size_t s) const { - BigInt result(*this); - result.shift_right(s); - return result; - } - - LIBC_INLINE constexpr BigInt &operator>>=(size_t s) { - shift_right(s); - return *this; + return BigInt(multiword::shift(val, s)); } #define DEFINE_BINOP(OP) \ @@ -833,10 +768,9 @@ struct BigInt { return lhs; \ } - DEFINE_BINOP(&) - DEFINE_BINOP(|) - DEFINE_BINOP(^) - + DEFINE_BINOP(&) // & and &= + DEFINE_BINOP(|) // | and |= + DEFINE_BINOP(^) // ^ and ^= #undef DEFINE_BINOP LIBC_INLINE constexpr BigInt operator~() const { @@ -847,8 +781,8 @@ struct BigInt { } LIBC_INLINE constexpr BigInt operator-() const { - BigInt result = ~(*this); - result.add(BigInt(1)); + BigInt result(*this); + result.negate(); return result; } @@ -865,24 +799,6 @@ struct BigInt { return !(lhs == rhs); } -private: - LIBC_INLINE friend constexpr int cmp(const BigInt &lhs, const BigInt &rhs) { - const auto compare = [](WordType a, WordType b) { - return a == b ? 0 : a > b ? 1 : -1; - }; - if constexpr (Signed) { - const bool lhs_is_neg = lhs.is_neg(); - const bool rhs_is_neg = rhs.is_neg(); - if (lhs_is_neg != rhs_is_neg) - return rhs_is_neg ? 
1 : -1; - } - for (size_t i = WORD_COUNT; i-- > 0;) - if (auto cmp = compare(lhs[i], rhs[i]); cmp != 0) - return cmp; - return 0; - } - -public: LIBC_INLINE friend constexpr bool operator>(const BigInt &lhs, const BigInt &rhs) { return cmp(lhs, rhs) > 0; @@ -901,24 +817,24 @@ public: } LIBC_INLINE constexpr BigInt &operator++() { - add(BigInt(1)); + increment(); return *this; } LIBC_INLINE constexpr BigInt operator++(int) { BigInt oldval(*this); - add(BigInt(1)); + increment(); return oldval; } LIBC_INLINE constexpr BigInt &operator--() { - sub(BigInt(1)); + decrement(); return *this; } LIBC_INLINE constexpr BigInt operator--(int) { BigInt oldval(*this); - sub(BigInt(1)); + decrement(); return oldval; } @@ -930,9 +846,117 @@ public: // Return the i-th word of the number. LIBC_INLINE constexpr WordType &operator[](size_t i) { return val[i]; } - LIBC_INLINE WordType *data() { return val; } +private: + LIBC_INLINE friend constexpr int cmp(const BigInt &lhs, const BigInt &rhs) { + constexpr auto compare = [](WordType a, WordType b) { + return a == b ? 0 : a > b ? 1 : -1; + }; + if constexpr (Signed) { + const bool lhs_is_neg = lhs.is_neg(); + const bool rhs_is_neg = rhs.is_neg(); + if (lhs_is_neg != rhs_is_neg) + return rhs_is_neg ? 1 : -1; + } + for (size_t i = WORD_COUNT; i-- > 0;) + if (auto cmp = compare(lhs[i], rhs[i]); cmp != 0) + return cmp; + return 0; + } + + LIBC_INLINE constexpr void bitwise_not() { + for (auto &part : val) + part = ~part; + } + + LIBC_INLINE constexpr void negate() { + bitwise_not(); + increment(); + } - LIBC_INLINE const WordType *data() const { return val; } + LIBC_INLINE constexpr void increment() { + multiword::add_with_carry(val, cpp::array{1}); + } + + LIBC_INLINE constexpr void decrement() { + multiword::add_with_carry(val, cpp::array{1}); + } + + LIBC_INLINE constexpr void extend(size_t index, bool is_neg) { + const WordType value = is_neg ? 
cpp::numeric_limits::max() + : cpp::numeric_limits::min(); + for (size_t i = index; i < WORD_COUNT; ++i) + val[i] = value; + } + + LIBC_INLINE constexpr bool get_msb() const { + return val.back() >> (WORD_SIZE - 1); + } + + LIBC_INLINE constexpr void set_msb() { + val.back() |= mask_leading_ones(); + } + + LIBC_INLINE constexpr void clear_msb() { + val.back() &= mask_trailing_ones(); + } + + LIBC_INLINE constexpr void set_bit(size_t i) { + const size_t word_index = i / WORD_SIZE; + val[word_index] |= WordType(1) << (i % WORD_SIZE); + } + + LIBC_INLINE constexpr static Division divide_unsigned(const BigInt ÷nd, + const BigInt ÷r) { + BigInt remainder = dividend; + BigInt quotient; + if (remainder >= divider) { + BigInt subtractor = divider; + int cur_bit = multiword::countl_zero(subtractor.val) - + multiword::countl_zero(remainder.val); + subtractor <<= cur_bit; + for (; cur_bit >= 0 && remainder > 0; --cur_bit, subtractor >>= 1) { + if (remainder < subtractor) + continue; + remainder -= subtractor; + quotient.set_bit(cur_bit); + } + } + return Division{quotient, remainder}; + } + + LIBC_INLINE constexpr static Division divide_signed(const BigInt ÷nd, + const BigInt ÷r) { + // Special case because it is not possible to negate the min value of a + // signed integer. + if (dividend == min() && divider == min()) + return Division{one(), zero()}; + // 1. Convert the dividend and divisor to unsigned representation. + unsigned_type udividend(dividend); + unsigned_type udivider(divider); + // 2. Negate the dividend if it's negative, and similarly for the divisor. + const bool dividend_is_neg = dividend.is_neg(); + const bool divider_is_neg = divider.is_neg(); + if (dividend_is_neg) + udividend.negate(); + if (divider_is_neg) + udivider.negate(); + // 3. Use unsigned multiword division algorithm. + const auto unsigned_result = divide_unsigned(udividend, udivider); + // 4. Convert the quotient and remainder to signed representation. 
+ Division result; + result.quotient = signed_type(unsigned_result.quotient); + result.remainder = signed_type(unsigned_result.remainder); + // 5. Negate the quotient if the dividend and divisor had opposite signs. + if (dividend_is_neg != divider_is_neg) + result.quotient.negate(); + // 6. Negate the remainder if the dividend was negative. + if (dividend_is_neg) + result.remainder.negate(); + return result; + } + + friend signed_type; + friend unsigned_type; }; namespace internal { @@ -962,10 +986,8 @@ using Int = BigInt>; // Provides limits of U/Int<128>. template <> class cpp::numeric_limits> { public: - LIBC_INLINE static constexpr UInt<128> max() { - return UInt<128>({0xffff'ffff'ffff'ffff, 0xffff'ffff'ffff'ffff}); - } - LIBC_INLINE static constexpr UInt<128> min() { return UInt<128>(0); } + LIBC_INLINE static constexpr UInt<128> max() { return UInt<128>::max(); } + LIBC_INLINE static constexpr UInt<128> min() { return UInt<128>::min(); } // Meant to match std::numeric_limits interface. // NOLINTNEXTLINE(readability-identifier-naming) LIBC_INLINE_VAR static constexpr int digits = 128; @@ -973,12 +995,8 @@ public: template <> class cpp::numeric_limits> { public: - LIBC_INLINE static constexpr Int<128> max() { - return Int<128>({0xffff'ffff'ffff'ffff, 0x7fff'ffff'ffff'ffff}); - } - LIBC_INLINE static constexpr Int<128> min() { - return Int<128>({0, 0x8000'0000'0000'0000}); - } + LIBC_INLINE static constexpr Int<128> max() { return Int<128>::max(); } + LIBC_INLINE static constexpr Int<128> min() { return Int<128>::min(); } // Meant to match std::numeric_limits interface. // NOLINTNEXTLINE(readability-identifier-naming) LIBC_INLINE_VAR static constexpr int digits = 128; @@ -1112,30 +1130,28 @@ has_single_bit(T value) { template [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> countr_zero(const T &value) { - return value.ctz(); + return multiword::countr_zero(value.val); } // Specialization of cpp::countl_zero ('bit.h') for BigInt. 
template [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> countl_zero(const T &value) { - return value.clz(); + return multiword::countl_zero(value.val); } // Specialization of cpp::countl_one ('bit.h') for BigInt. template [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> countl_one(T value) { - // TODO : Implement a faster version not involving operator~. - return cpp::countl_zero(~value); + return multiword::countl_one(value.val); } // Specialization of cpp::countr_one ('bit.h') for BigInt. template [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> countr_one(T value) { - // TODO : Implement a faster version not involving operator~. - return cpp::countr_zero(~value); + return multiword::countr_one(value.val); } // Specialization of cpp::bit_width ('bit.h') for BigInt. @@ -1182,65 +1198,59 @@ rotr(T value, int rotate) { template LIBC_INLINE constexpr cpp::enable_if_t, T> mask_trailing_ones() { - static_assert(!T::SIGNED); - if (count == 0) - return T(); - constexpr unsigned T_BITS = CHAR_BIT * sizeof(T); - static_assert(count <= T_BITS && "Invalid bit index"); - using word_type = typename T::word_type; - T out; - constexpr int CHUNK_INDEX_CONTAINING_BIT = - static_cast(count / T::WORD_SIZE); - int index = 0; - for (auto &word : out.val) { - if (index < CHUNK_INDEX_CONTAINING_BIT) - word = -1; - else if (index > CHUNK_INDEX_CONTAINING_BIT) - word = 0; - else - word = mask_trailing_ones(); - ++index; - } + static_assert(!T::SIGNED && count <= T::BITS); + if (count == T::BITS) + return T::all_ones(); + constexpr size_t QUOTIENT = count / T::WORD_SIZE; + constexpr size_t REMAINDER = count % T::WORD_SIZE; + T out; // zero initialized + for (size_t i = 0; i <= QUOTIENT; ++i) + out[i] = i < QUOTIENT + ? -1 + : mask_trailing_ones(); return out; } // Specialization of mask_leading_ones ('math_extras.h') for BigInt. 
template LIBC_INLINE constexpr cpp::enable_if_t, T> mask_leading_ones() { - static_assert(!T::SIGNED); - if (count == 0) - return T(); - constexpr unsigned T_BITS = CHAR_BIT * sizeof(T); - static_assert(count <= T_BITS && "Invalid bit index"); - using word_type = typename T::word_type; - T out; - constexpr int CHUNK_INDEX_CONTAINING_BIT = - static_cast((T::BITS - count - 1ULL) / T::WORD_SIZE); - int index = 0; - for (auto &word : out.val) { - if (index < CHUNK_INDEX_CONTAINING_BIT) - word = 0; - else if (index > CHUNK_INDEX_CONTAINING_BIT) - word = -1; - else - word = mask_leading_ones(); - ++index; - } + static_assert(!T::SIGNED && count <= T::BITS); + if (count == T::BITS) + return T::all_ones(); + constexpr size_t QUOTIENT = (T::BITS - count - 1U) / T::WORD_SIZE; + constexpr size_t REMAINDER = count % T::WORD_SIZE; + T out; // zero initialized + for (size_t i = QUOTIENT; i < T::WORD_COUNT; ++i) + out[i] = i > QUOTIENT + ? -1 + : mask_leading_ones(); return out; } +// Specialization of mask_trailing_zeros ('math_extras.h') for BigInt. +template +LIBC_INLINE constexpr cpp::enable_if_t, T> +mask_trailing_zeros() { + return mask_leading_ones(); +} + +// Specialization of mask_leading_zeros ('math_extras.h') for BigInt. +template +LIBC_INLINE constexpr cpp::enable_if_t, T> +mask_leading_zeros() { + return mask_trailing_ones(); +} + // Specialization of count_zeros ('math_extras.h') for BigInt. template -[[nodiscard]] -LIBC_INLINE constexpr cpp::enable_if_t, int> +[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> count_zeros(T value) { return cpp::popcount(~value); } // Specialization of first_leading_zero ('math_extras.h') for BigInt. template -[[nodiscard]] -LIBC_INLINE constexpr cpp::enable_if_t, int> +[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> first_leading_zero(T value) { return value == cpp::numeric_limits::max() ? 
0 : cpp::countl_one(value) + 1; @@ -1248,16 +1258,14 @@ first_leading_zero(T value) { // Specialization of first_leading_one ('math_extras.h') for BigInt. template -[[nodiscard]] -LIBC_INLINE constexpr cpp::enable_if_t, int> +[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> first_leading_one(T value) { return first_leading_zero(~value); } // Specialization of first_trailing_zero ('math_extras.h') for BigInt. template -[[nodiscard]] -LIBC_INLINE constexpr cpp::enable_if_t, int> +[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> first_trailing_zero(T value) { return value == cpp::numeric_limits::max() ? 0 : cpp::countr_zero(~value) + 1; @@ -1265,8 +1273,7 @@ first_trailing_zero(T value) { // Specialization of first_trailing_one ('math_extras.h') for BigInt. template -[[nodiscard]] -LIBC_INLINE constexpr cpp::enable_if_t, int> +[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> first_trailing_one(T value) { return value == cpp::numeric_limits::max() ? 0 : cpp::countr_zero(value) + 1; diff --git a/libc/src/__support/float_to_string.h b/libc/src/__support/float_to_string.h index 1287c3e9a84f..4c59cfd99c2e 100644 --- a/libc/src/__support/float_to_string.h +++ b/libc/src/__support/float_to_string.h @@ -689,7 +689,7 @@ template <> class FloatToString { wide_int float_as_int = mantissa; - float_as_int.shift_left(exponent); + float_as_int <<= exponent; int_block_index = 0; while (float_as_int > 0) { @@ -708,10 +708,11 @@ template <> class FloatToString { const int SHIFT_AMOUNT = FLOAT_AS_INT_WIDTH + exponent; static_assert(EXTRA_INT_WIDTH >= sizeof(long double) * 8); - float_as_fixed.shift_left(SHIFT_AMOUNT); + float_as_fixed <<= SHIFT_AMOUNT; // If there are still digits above the decimal point, handle those. 
- if (float_as_fixed.clz() < static_cast(EXTRA_INT_WIDTH)) { + if (cpp::countl_zero(float_as_fixed) < + static_cast(EXTRA_INT_WIDTH)) { UInt above_decimal_point = float_as_fixed >> FLOAT_AS_INT_WIDTH; diff --git a/libc/src/__support/integer_literals.h b/libc/src/__support/integer_literals.h index de1f88fbd3f3..e99799c3512e 100644 --- a/libc/src/__support/integer_literals.h +++ b/libc/src/__support/integer_literals.h @@ -151,12 +151,15 @@ template struct Parser> { template LIBC_INLINE constexpr T parse_with_prefix(const char *ptr) { using P = Parser; - if (ptr[0] == '0' && ptr[1] == 'x') - return P::template parse<16>(ptr + 2); - else if (ptr[0] == '0' && ptr[1] == 'b') - return P::template parse<2>(ptr + 2); - else - return P::template parse<10>(ptr); + if (ptr == nullptr) + return T(); + if (ptr[0] == '0') { + if (ptr[1] == 'b') + return P::template parse<2>(ptr + 2); + if (ptr[1] == 'x') + return P::template parse<16>(ptr + 2); + } + return P::template parse<10>(ptr); } } // namespace internal @@ -169,6 +172,16 @@ LIBC_INLINE constexpr auto operator""_u256(const char *x) { return internal::parse_with_prefix>(x); } +template LIBC_INLINE constexpr T parse_bigint(const char *ptr) { + if (ptr == nullptr) + return T(); + if (ptr[0] == '-' || ptr[0] == '+') { + auto positive = internal::parse_with_prefix(ptr + 1); + return ptr[0] == '-' ? 
-positive : positive; + } + return internal::parse_with_prefix(ptr); +} + } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC___SUPPORT_INTEGER_LITERALS_H diff --git a/libc/src/__support/math_extras.h b/libc/src/__support/math_extras.h index 70a8800b285d..bb6424bfa8e4 100644 --- a/libc/src/__support/math_extras.h +++ b/libc/src/__support/math_extras.h @@ -10,9 +10,9 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXTRAS_H #define LLVM_LIBC_SRC___SUPPORT_MATH_EXTRAS_H -#include "src/__support/CPP/bit.h" // countl_one, countr_zero -#include "src/__support/CPP/limits.h" // CHAR_BIT, numeric_limits -#include "src/__support/CPP/type_traits.h" // is_unsigned_v +#include "src/__support/CPP/bit.h" // countl_one, countr_zero +#include "src/__support/CPP/limits.h" // CHAR_BIT, numeric_limits +#include "src/__support/CPP/type_traits.h" // is_unsigned_v, is_constant_evaluated #include "src/__support/macros/attributes.h" // LIBC_INLINE namespace LIBC_NAMESPACE { @@ -32,199 +32,94 @@ mask_trailing_ones() { template LIBC_INLINE constexpr cpp::enable_if_t, T> mask_leading_ones() { - constexpr T MASK(mask_trailing_ones()); - return T(~MASK); // bitwise NOT performs integer promotion. + return T(~mask_trailing_ones()); } -// Add with carry -template struct SumCarry { - T sum; - T carry; -}; - -// This version is always valid for constexpr. 
-template -LIBC_INLINE constexpr cpp::enable_if_t< - cpp::is_integral_v && cpp::is_unsigned_v, SumCarry> -add_with_carry_const(T a, T b, T carry_in) { - T tmp = a + carry_in; - T sum = b + tmp; - T carry_out = (sum < b) + (tmp < a); - return {sum, carry_out}; -} - -template -LIBC_INLINE constexpr cpp::enable_if_t< - cpp::is_integral_v && cpp::is_unsigned_v, SumCarry> -add_with_carry(T a, T b, T carry_in) { - return add_with_carry_const(a, b, carry_in); -} - -#if __has_builtin(__builtin_addc) -// https://clang.llvm.org/docs/LanguageExtensions.html#multiprecision-arithmetic-builtins - -template <> -LIBC_INLINE constexpr SumCarry -add_with_carry(unsigned char a, unsigned char b, - unsigned char carry_in) { - if (__builtin_is_constant_evaluated()) { - return add_with_carry_const(a, b, carry_in); - } else { - SumCarry result{0, 0}; - result.sum = __builtin_addcb(a, b, carry_in, &result.carry); - return result; - } -} - -template <> -LIBC_INLINE constexpr SumCarry -add_with_carry(unsigned short a, unsigned short b, - unsigned short carry_in) { - if (__builtin_is_constant_evaluated()) { - return add_with_carry_const(a, b, carry_in); - } else { - SumCarry result{0, 0}; - result.sum = __builtin_addcs(a, b, carry_in, &result.carry); - return result; - } -} - -template <> -LIBC_INLINE constexpr SumCarry -add_with_carry(unsigned int a, unsigned int b, - unsigned int carry_in) { - if (__builtin_is_constant_evaluated()) { - return add_with_carry_const(a, b, carry_in); - } else { - SumCarry result{0, 0}; - result.sum = __builtin_addc(a, b, carry_in, &result.carry); - return result; - } -} - -template <> -LIBC_INLINE constexpr SumCarry -add_with_carry(unsigned long a, unsigned long b, - unsigned long carry_in) { - if (__builtin_is_constant_evaluated()) { - return add_with_carry_const(a, b, carry_in); - } else { - SumCarry result{0, 0}; - result.sum = __builtin_addcl(a, b, carry_in, &result.carry); - return result; - } +// Create a bitmask with the count right-most bits set to 0, 
and all other bits +// set to 1. Only unsigned types are allowed. +template +LIBC_INLINE constexpr cpp::enable_if_t, T> +mask_trailing_zeros() { + return mask_leading_ones(); } -template <> -LIBC_INLINE constexpr SumCarry -add_with_carry(unsigned long long a, unsigned long long b, - unsigned long long carry_in) { - if (__builtin_is_constant_evaluated()) { - return add_with_carry_const(a, b, carry_in); - } else { - SumCarry result{0, 0}; - result.sum = __builtin_addcll(a, b, carry_in, &result.carry); - return result; - } +// Create a bitmask with the count left-most bits set to 0, and all other bits +// set to 1. Only unsigned types are allowed. +template +LIBC_INLINE constexpr cpp::enable_if_t, T> +mask_leading_zeros() { + return mask_trailing_ones(); } -#endif // __has_builtin(__builtin_addc) - -// Subtract with borrow -template struct DiffBorrow { - T diff; - T borrow; -}; - -// This version is always valid for constexpr. +// Returns whether 'a + b' overflows, the result is stored in 'res'. template -LIBC_INLINE constexpr cpp::enable_if_t< - cpp::is_integral_v && cpp::is_unsigned_v, DiffBorrow> -sub_with_borrow_const(T a, T b, T borrow_in) { - T tmp = a - b; - T diff = tmp - borrow_in; - T borrow_out = (diff > tmp) + (tmp > a); - return {diff, borrow_out}; +[[nodiscard]] LIBC_INLINE constexpr bool add_overflow(T a, T b, T &res) { + return __builtin_add_overflow(a, b, &res); } -// This version is not always valid for constepxr because it's overriden below -// if builtins are available. +// Returns whether 'a - b' overflows, the result is stored in 'res'. 
template -LIBC_INLINE constexpr cpp::enable_if_t< - cpp::is_integral_v && cpp::is_unsigned_v, DiffBorrow> -sub_with_borrow(T a, T b, T borrow_in) { - return sub_with_borrow_const(a, b, borrow_in); +[[nodiscard]] LIBC_INLINE constexpr bool sub_overflow(T a, T b, T &res) { + return __builtin_sub_overflow(a, b, &res); } -#if __has_builtin(__builtin_subc) -// https://clang.llvm.org/docs/LanguageExtensions.html#multiprecision-arithmetic-builtins - -template <> -LIBC_INLINE constexpr DiffBorrow -sub_with_borrow(unsigned char a, unsigned char b, - unsigned char borrow_in) { - if (__builtin_is_constant_evaluated()) { - return sub_with_borrow_const(a, b, borrow_in); - } else { - DiffBorrow result{0, 0}; - result.diff = __builtin_subcb(a, b, borrow_in, &result.borrow); - return result; - } -} +#define RETURN_IF(TYPE, BUILTIN) \ + if constexpr (cpp::is_same_v) \ + return BUILTIN(a, b, carry_in, carry_out); -template <> -LIBC_INLINE constexpr DiffBorrow -sub_with_borrow(unsigned short a, unsigned short b, - unsigned short borrow_in) { - if (__builtin_is_constant_evaluated()) { - return sub_with_borrow_const(a, b, borrow_in); - } else { - DiffBorrow result{0, 0}; - result.diff = __builtin_subcs(a, b, borrow_in, &result.borrow); - return result; - } -} - -template <> -LIBC_INLINE constexpr DiffBorrow -sub_with_borrow(unsigned int a, unsigned int b, - unsigned int borrow_in) { - if (__builtin_is_constant_evaluated()) { - return sub_with_borrow_const(a, b, borrow_in); - } else { - DiffBorrow result{0, 0}; - result.diff = __builtin_subc(a, b, borrow_in, &result.borrow); - return result; - } -} - -template <> -LIBC_INLINE constexpr DiffBorrow -sub_with_borrow(unsigned long a, unsigned long b, - unsigned long borrow_in) { - if (__builtin_is_constant_evaluated()) { - return sub_with_borrow_const(a, b, borrow_in); - } else { - DiffBorrow result{0, 0}; - result.diff = __builtin_subcl(a, b, borrow_in, &result.borrow); - return result; +// Returns the result of 'a + b' taking into account 
'carry_in'. +// The carry out is stored in 'carry_out' it not 'nullptr', dropped otherwise. +// We keep the pass by pointer interface for consistency with the intrinsic. +template +[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, T> +add_with_carry(T a, T b, T carry_in, T &carry_out) { + if constexpr (!cpp::is_constant_evaluated()) { +#if __has_builtin(__builtin_addcb) + RETURN_IF(unsigned char, __builtin_addcb) +#elif __has_builtin(__builtin_addcs) + RETURN_IF(unsigned short, __builtin_addcs) +#elif __has_builtin(__builtin_addc) + RETURN_IF(unsigned int, __builtin_addc) +#elif __has_builtin(__builtin_addcl) + RETURN_IF(unsigned long, __builtin_addcl) +#elif __has_builtin(__builtin_addcll) + RETURN_IF(unsigned long long, __builtin_addcll) +#endif } + T sum; + T carry1 = add_overflow(a, b, sum); + T carry2 = add_overflow(sum, carry_in, sum); + carry_out = carry1 | carry2; + return sum; } -template <> -LIBC_INLINE constexpr DiffBorrow -sub_with_borrow(unsigned long long a, unsigned long long b, - unsigned long long borrow_in) { - if (__builtin_is_constant_evaluated()) { - return sub_with_borrow_const(a, b, borrow_in); - } else { - DiffBorrow result{0, 0}; - result.diff = __builtin_subcll(a, b, borrow_in, &result.borrow); - return result; +// Returns the result of 'a - b' taking into account 'carry_in'. +// The carry out is stored in 'carry_out' it not 'nullptr', dropped otherwise. +// We keep the pass by pointer interface for consistency with the intrinsic. 
+template +[[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, T> +sub_with_borrow(T a, T b, T carry_in, T &carry_out) { + if constexpr (!cpp::is_constant_evaluated()) { +#if __has_builtin(__builtin_subcb) + RETURN_IF(unsigned char, __builtin_subcb) +#elif __has_builtin(__builtin_subcs) + RETURN_IF(unsigned short, __builtin_subcs) +#elif __has_builtin(__builtin_subc) + RETURN_IF(unsigned int, __builtin_subc) +#elif __has_builtin(__builtin_subcl) + RETURN_IF(unsigned long, __builtin_subcl) +#elif __has_builtin(__builtin_subcll) + RETURN_IF(unsigned long long, __builtin_subcll) +#endif } + T sub; + T carry1 = sub_overflow(a, b, sub); + T carry2 = sub_overflow(sub, carry_in, sub); + carry_out = carry1 | carry2; + return sub; } -#endif // __has_builtin(__builtin_subc) +#undef RETURN_IF template [[nodiscard]] LIBC_INLINE constexpr cpp::enable_if_t, int> diff --git a/libc/src/__support/number_pair.h b/libc/src/__support/number_pair.h index ee6667b1299f..2f713fc03520 100644 --- a/libc/src/__support/number_pair.h +++ b/libc/src/__support/number_pair.h @@ -20,17 +20,6 @@ template struct NumberPair { T hi = T(0); }; -template -cpp::enable_if_t && cpp::is_unsigned_v, - NumberPair> constexpr split(T a) { - constexpr size_t HALF_BIT_WIDTH = sizeof(T) * 4; - constexpr T LOWER_HALF_MASK = (T(1) << HALF_BIT_WIDTH) - T(1); - NumberPair result; - result.lo = a & LOWER_HALF_MASK; - result.hi = a >> HALF_BIT_WIDTH; - return result; -} - } // namespace LIBC_NAMESPACE #endif // LLVM_LIBC_SRC___SUPPORT_NUMBER_PAIR_H diff --git a/libc/test/src/__support/integer_literals_test.cpp b/libc/test/src/__support/integer_literals_test.cpp index 5298cf30156e..cbc906aa7c99 100644 --- a/libc/test/src/__support/integer_literals_test.cpp +++ b/libc/test/src/__support/integer_literals_test.cpp @@ -133,3 +133,24 @@ TEST(LlvmLibcIntegerLiteralTest, u256) { U256_MAX, 0xFFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF'FFFFFFFF_u256); } + +TEST(LlvmLibcIntegerLiteralTest, parse_bigint) { + 
using T = LIBC_NAMESPACE::Int<128>; + struct { + const char *str; + T expected; + } constexpr TEST_CASES[] = { + {"0", 0}, {"-1", -1}, {"+1", 1}, {"-0xFF", -255}, {"-0b11", -3}, + }; + for (auto tc : TEST_CASES) { + T actual = LIBC_NAMESPACE::parse_bigint(tc.str); + EXPECT_EQ(actual, tc.expected); + } +} + +TEST(LlvmLibcIntegerLiteralTest, parse_bigint_invalid) { + using T = LIBC_NAMESPACE::Int<128>; + const T expected; // default construction + EXPECT_EQ(LIBC_NAMESPACE::parse_bigint(nullptr), expected); + EXPECT_EQ(LIBC_NAMESPACE::parse_bigint(""), expected); +} diff --git a/libc/test/src/__support/math_extras_test.cpp b/libc/test/src/__support/math_extras_test.cpp index e88b3e1d6b68..401e631ea4ba 100644 --- a/libc/test/src/__support/math_extras_test.cpp +++ b/libc/test/src/__support/math_extras_test.cpp @@ -101,4 +101,61 @@ TYPED_TEST(LlvmLibcBitTest, CountZeros, UnsignedTypesNoBigInt) { EXPECT_EQ(count_zeros(cpp::numeric_limits::max() >> i), i); } +using UnsignedTypes = testing::TypeList< +#if defined(__SIZEOF_INT128__) + __uint128_t, +#endif + unsigned char, unsigned short, unsigned int, unsigned long, + unsigned long long>; + +TYPED_TEST(LlvmLibcBlockMathExtrasTest, add_overflow, UnsignedTypes) { + constexpr T ZERO = cpp::numeric_limits::min(); + constexpr T ONE(1); + constexpr T MAX = cpp::numeric_limits::max(); + constexpr T BEFORE_MAX = MAX - 1; + + const struct { + T lhs; + T rhs; + T sum; + bool carry; + } TESTS[] = { + {ZERO, ONE, ONE, false}, // 0x00 + 0x01 = 0x01 + {BEFORE_MAX, ONE, MAX, false}, // 0xFE + 0x01 = 0xFF + {MAX, ONE, ZERO, true}, // 0xFF + 0x01 = 0x00 (carry) + {MAX, MAX, BEFORE_MAX, true}, // 0xFF + 0xFF = 0xFE (carry) + }; + for (auto tc : TESTS) { + T sum; + bool carry = add_overflow(tc.lhs, tc.rhs, sum); + EXPECT_EQ(sum, tc.sum); + EXPECT_EQ(carry, tc.carry); + } +} + +TYPED_TEST(LlvmLibcBlockMathExtrasTest, sub_overflow, UnsignedTypes) { + constexpr T ZERO = cpp::numeric_limits::min(); + constexpr T ONE(1); + constexpr T MAX = 
cpp::numeric_limits::max(); + constexpr T BEFORE_MAX = MAX - 1; + + const struct { + T lhs; + T rhs; + T sub; + bool carry; + } TESTS[] = { + {ONE, ZERO, ONE, false}, // 0x01 - 0x00 = 0x01 + {MAX, MAX, ZERO, false}, // 0xFF - 0xFF = 0x00 + {ZERO, ONE, MAX, true}, // 0x00 - 0x01 = 0xFF (carry) + {BEFORE_MAX, MAX, MAX, true}, // 0xFE - 0xFF = 0xFF (carry) + }; + for (auto tc : TESTS) { + T sub; + bool carry = sub_overflow(tc.lhs, tc.rhs, sub); + EXPECT_EQ(sub, tc.sub); + EXPECT_EQ(carry, tc.carry); + } +} + } // namespace LIBC_NAMESPACE diff --git a/libc/test/src/__support/uint_test.cpp b/libc/test/src/__support/uint_test.cpp index 5764324ca288..5696e54c73f3 100644 --- a/libc/test/src/__support/uint_test.cpp +++ b/libc/test/src/__support/uint_test.cpp @@ -8,6 +8,7 @@ #include "src/__support/CPP/optional.h" #include "src/__support/UInt.h" +#include "src/__support/integer_literals.h" // parse_unsigned_bigint #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128 #include "include/llvm-libc-macros/math-macros.h" // HUGE_VALF, HUGE_VALF @@ -15,6 +16,195 @@ namespace LIBC_NAMESPACE { +enum Value { ZERO, ONE, TWO, MIN, MAX }; + +template auto create(Value value) { + switch (value) { + case ZERO: + return T(0); + case ONE: + return T(1); + case TWO: + return T(2); + case MIN: + return T::min(); + case MAX: + return T::max(); + } +} + +using Types = testing::TypeList< // +#ifdef LIBC_TYPES_HAS_INT64 + BigInt<64, false, uint64_t>, // 64-bits unsigned (1 x uint64_t) + BigInt<64, true, uint64_t>, // 64-bits signed (1 x uint64_t) +#endif +#ifdef LIBC_TYPES_HAS_INT128 + BigInt<128, false, __uint128_t>, // 128-bits unsigned (1 x __uint128_t) + BigInt<128, true, __uint128_t>, // 128-bits signed (1 x __uint128_t) +#endif + BigInt<16, false, uint16_t>, // 16-bits unsigned (1 x uint16_t) + BigInt<16, true, uint16_t>, // 16-bits signed (1 x uint16_t) + BigInt<64, false, uint16_t>, // 64-bits unsigned (4 x uint16_t) + BigInt<64, true, uint16_t> // 64-bits signed (4 
x uint16_t) + >; + +#define ASSERT_SAME(A, B) ASSERT_TRUE((A) == (B)) + +TYPED_TEST(LlvmLibcUIntClassTest, Additions, Types) { + ASSERT_SAME(create(ZERO) + create(ZERO), create(ZERO)); + ASSERT_SAME(create(ONE) + create(ZERO), create(ONE)); + ASSERT_SAME(create(ZERO) + create(ONE), create(ONE)); + ASSERT_SAME(create(ONE) + create(ONE), create(TWO)); + // 2's complement addition works for signed and unsigned types. + // - unsigned : 0xff + 0x01 = 0x00 (255 + 1 = 0) + // - signed : 0xef + 0x01 = 0xf0 (127 + 1 = -128) + ASSERT_SAME(create(MAX) + create(ONE), create(MIN)); +} + +TYPED_TEST(LlvmLibcUIntClassTest, Subtraction, Types) { + ASSERT_SAME(create(ZERO) - create(ZERO), create(ZERO)); + ASSERT_SAME(create(ONE) - create(ONE), create(ZERO)); + ASSERT_SAME(create(ONE) - create(ZERO), create(ONE)); + // 2's complement subtraction works for signed and unsigned types. + // - unsigned : 0x00 - 0x01 = 0xff ( 0 - 1 = 255) + // - signed : 0xf0 - 0x01 = 0xef (-128 - 1 = 127) + ASSERT_SAME(create(MIN) - create(ONE), create(MAX)); +} + +TYPED_TEST(LlvmLibcUIntClassTest, Multiplication, Types) { + ASSERT_SAME(create(ZERO) * create(ZERO), create(ZERO)); + ASSERT_SAME(create(ZERO) * create(ONE), create(ZERO)); + ASSERT_SAME(create(ONE) * create(ZERO), create(ZERO)); + ASSERT_SAME(create(ONE) * create(ONE), create(ONE)); + ASSERT_SAME(create(ONE) * create(TWO), create(TWO)); + ASSERT_SAME(create(TWO) * create(ONE), create(TWO)); + // - unsigned : 0xff x 0xff = 0x01 (mod 0xff) + // - signed : 0xef x 0xef = 0x01 (mod 0xff) + ASSERT_SAME(create(MAX) * create(MAX), create(ONE)); +} + +template void print(const char *msg, T value) { + testing::tlog << msg; + IntegerToString buffer(value); + testing::tlog << buffer.view() << "\n"; +} + +TEST(LlvmLibcUIntClassTest, SignedAddSub) { + // Computations performed by https://www.wolframalpha.com/ + using T = BigInt<128, true, uint32_t>; + const T a = parse_bigint("1927508279017230597"); + const T b = parse_bigint("278789278723478925"); + 
const T s = parse_bigint("2206297557740709522"); + // Addition + ASSERT_SAME(a + b, s); + ASSERT_SAME(b + a, s); // commutative + // Subtraction + ASSERT_SAME(a - s, -b); + ASSERT_SAME(s - a, b); +} + +TEST(LlvmLibcUIntClassTest, SignedMulDiv) { + // Computations performed by https://www.wolframalpha.com/ + using T = BigInt<128, true, uint16_t>; + struct { + const char *a; + const char *b; + const char *mul; + } const test_cases[] = {{"-4", "3", "-12"}, + {"-3", "-3", "9"}, + {"1927508279017230597", "278789278723478925", + "537368642840747885329125014794668225"}}; + for (auto tc : test_cases) { + const T a = parse_bigint(tc.a); + const T b = parse_bigint(tc.b); + const T mul = parse_bigint(tc.mul); + // Multiplication + ASSERT_SAME(a * b, mul); + ASSERT_SAME(b * a, mul); // commutative + ASSERT_SAME(a * -b, -mul); // sign + ASSERT_SAME(-a * b, -mul); // sign + ASSERT_SAME(-a * -b, mul); // sign + // Division + ASSERT_SAME(mul / a, b); + ASSERT_SAME(mul / b, a); + ASSERT_SAME(-mul / a, -b); // sign + ASSERT_SAME(mul / -a, -b); // sign + ASSERT_SAME(-mul / -a, b); // sign + } +} + +TYPED_TEST(LlvmLibcUIntClassTest, Division, Types) { + ASSERT_SAME(create(ZERO) / create(ONE), create(ZERO)); + ASSERT_SAME(create(MAX) / create(ONE), create(MAX)); + ASSERT_SAME(create(MAX) / create(MAX), create(ONE)); + ASSERT_SAME(create(ONE) / create(ONE), create(ONE)); + if constexpr (T::SIGNED) { + // Special case found by fuzzing. 
+ ASSERT_SAME(create(MIN) / create(MIN), create(ONE)); + } + // - unsigned : 0xff / 0x02 = 0x7f + // - signed : 0xef / 0x02 = 0x77 + ASSERT_SAME(create(MAX) / create(TWO), (create(MAX) >> 1)); + + using word_type = typename T::word_type; + const T zero_one_repeated = T::all_ones() / T(0xff); + const word_type pattern = word_type(~0) / word_type(0xff); + for (const word_type part : zero_one_repeated.val) { + if constexpr (T::SIGNED == false) { + EXPECT_EQ(part, pattern); + } + } +} + +TYPED_TEST(LlvmLibcUIntClassTest, is_neg, Types) { + EXPECT_FALSE(create(ZERO).is_neg()); + EXPECT_FALSE(create(ONE).is_neg()); + EXPECT_FALSE(create(TWO).is_neg()); + EXPECT_EQ(create(MIN).is_neg(), T::SIGNED); + EXPECT_FALSE(create(MAX).is_neg()); +} + +TYPED_TEST(LlvmLibcUIntClassTest, Masks, Types) { + if constexpr (!T::SIGNED) { + constexpr size_t BITS = T::BITS; + // mask_trailing_ones + ASSERT_SAME((mask_trailing_ones()), T::zero()); + ASSERT_SAME((mask_trailing_ones()), T::one()); + ASSERT_SAME((mask_trailing_ones()), T::all_ones() >> 1); + ASSERT_SAME((mask_trailing_ones()), T::all_ones()); + // mask_leading_ones + ASSERT_SAME((mask_leading_ones()), T::zero()); + ASSERT_SAME((mask_leading_ones()), T::one() << (BITS - 1)); + ASSERT_SAME((mask_leading_ones()), T::all_ones() - T::one()); + ASSERT_SAME((mask_leading_ones()), T::all_ones()); + // mask_trailing_zeros + ASSERT_SAME((mask_trailing_zeros()), T::all_ones()); + ASSERT_SAME((mask_trailing_zeros()), T::all_ones() - T::one()); + ASSERT_SAME((mask_trailing_zeros()), T::one() << (BITS - 1)); + ASSERT_SAME((mask_trailing_zeros()), T::zero()); + // mask_trailing_zeros + ASSERT_SAME((mask_leading_zeros()), T::all_ones()); + ASSERT_SAME((mask_leading_zeros()), T::all_ones() >> 1); + ASSERT_SAME((mask_leading_zeros()), T::one()); + ASSERT_SAME((mask_leading_zeros()), T::zero()); + } +} + +TYPED_TEST(LlvmLibcUIntClassTest, CountBits, Types) { + if constexpr (!T::SIGNED) { + for (size_t i = 0; i <= T::BITS; ++i) { + const auto l_one 
= T::all_ones() << i; // 0b111...000 + const auto r_one = T::all_ones() >> i; // 0b000...111 + const int zeros = i; + const int ones = T::BITS - zeros; + ASSERT_EQ(cpp::countr_one(r_one), ones); + ASSERT_EQ(cpp::countl_one(l_one), ones); + ASSERT_EQ(cpp::countr_zero(l_one), zeros); + ASSERT_EQ(cpp::countl_zero(r_one), zeros); + } + } +} + using LL_UInt64 = UInt<64>; // We want to test UInt<128> explicitly. So, for // convenience, we use a sugar which does not conflict with the UInt128 type @@ -561,7 +751,7 @@ TEST(LlvmLibcUIntClassTest, FullMulTests) { LL_UInt##Bits a = ~LL_UInt##Bits(0); \ LL_UInt##Bits hi = a.quick_mul_hi(a); \ LL_UInt##Bits trunc = static_cast(a.ful_mul(a) >> Bits); \ - uint64_t overflow = trunc.sub(hi); \ + uint64_t overflow = trunc.sub_overflow(hi); \ EXPECT_EQ(overflow, uint64_t(0)); \ EXPECT_LE(uint64_t(trunc), uint64_t(Error)); \ } while (0) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel index 4f976122967c..c0d402a89ea3 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel @@ -87,6 +87,7 @@ libc_test( srcs = ["uint_test.cpp"], deps = [ "//libc:__support_cpp_optional", + "//libc:__support_integer_literals", "//libc:__support_macros_properties_types", "//libc:__support_uint", "//libc:llvm_libc_macros_math_macros", -- cgit v1.2.3 From 5334b31e7c41174a418afbfe132bd0a86a47a22e Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Thu, 4 Apr 2024 10:46:45 +0100 Subject: [mlir][OpenMP][NFC] Use SmallVectorImpl for function arguments (#86978) --- .../LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 646d0ed73084..cacf2c37e38d 
100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -804,13 +804,13 @@ convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder, /// Allocate space for privatized reduction variables. template -static void -allocByValReductionVars(T loop, llvm::IRBuilderBase &builder, - LLVM::ModuleTranslation &moduleTranslation, - llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, - SmallVector &reductionDecls, - SmallVector &privateReductionVariables, - DenseMap &reductionVariableMap) { +static void allocByValReductionVars( + T loop, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation, + llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, + SmallVectorImpl &reductionDecls, + SmallVectorImpl &privateReductionVariables, + DenseMap &reductionVariableMap) { llvm::IRBuilderBase::InsertPointGuard guard(builder); builder.restoreIP(allocaIP); auto args = -- cgit v1.2.3 From ed5fe66370cb0ea88913458d71959407dc7b1394 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Thu, 4 Apr 2024 10:53:36 +0100 Subject: [RemoveDIs][BC] Reject intrinsic->record upgrades for old-format modules (#87494) Fixes issue noted at: https://github.com/llvm/llvm-project/pull/86274 When loading bitcode lazily, we may request debug intrinsics be upgraded to debug records during the module parsing phase; later on we perform this upgrade when materializing the module functions. If we change the module's debug info format between parsing and materializing however, then the requested upgrade is no longer correct and leads to an assertion. This patch fixes the issue by adding an extra check in the autoupgrader to see if the upgrade is no longer suitable, and either exit-out or fall back to the correct intrinsic->intrinsic upgrade if one is required. 
--- llvm/include/llvm/IR/AutoUpgrade.h | 3 ++- llvm/lib/IR/AutoUpgrade.cpp | 39 ++++++++++++++++++++++++++++---------- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h index 152f781ffa9b..97c3e4d7589d 100644 --- a/llvm/include/llvm/IR/AutoUpgrade.h +++ b/llvm/include/llvm/IR/AutoUpgrade.h @@ -36,7 +36,8 @@ namespace llvm { /// for upgrading, and returns true if it requires upgrading. It may return /// null in NewFn if the all calls to the original intrinsic function /// should be transformed to non-function-call instructions. - bool UpgradeIntrinsicFunction(Function *F, Function *&NewFn); + bool UpgradeIntrinsicFunction(Function *F, Function *&NewFn, + bool CanUpgradeDebugIntrinsicsToRecords = true); /// This is the complement to the above, replacing a specific call to an /// intrinsic function with a call to the specified new function. diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index a44f6af4162f..0f8c984d5e3c 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -983,7 +983,8 @@ static Intrinsic::ID shouldUpgradeNVPTXBF16Intrinsic(StringRef Name) { return Intrinsic::not_intrinsic; } -static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn) { +static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, + bool CanUpgradeDebugIntrinsicsToRecords) { assert(F && "Illegal to upgrade a non-existent Function."); StringRef Name = F->getName(); @@ -1057,7 +1058,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn) { case 'd': if (Name.consume_front("dbg.")) { // Mark debug intrinsics for upgrade to new debug format. - if (F->getParent()->IsNewDbgInfoFormat) { + if (CanUpgradeDebugIntrinsicsToRecords && + F->getParent()->IsNewDbgInfoFormat) { if (Name == "addr" || Name == "value" || Name == "assign" || Name == "declare" || Name == "label") { // There's no function to replace these with. 
@@ -1413,9 +1415,11 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn) { return false; } -bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn) { +bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn, + bool CanUpgradeDebugIntrinsicsToRecords) { NewFn = nullptr; - bool Upgraded = upgradeIntrinsicFunction1(F, NewFn); + bool Upgraded = + upgradeIntrinsicFunction1(F, NewFn, CanUpgradeDebugIntrinsicsToRecords); assert(F != NewFn && "Intrinsic function upgraded to the same function"); // Upgrade intrinsic attributes. This does not change the function. @@ -2412,6 +2416,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Builder.SetInsertPoint(CI->getParent(), CI->getIterator()); if (!NewFn) { + bool FallthroughToDefaultUpgrade = false; // Get the Function's name. StringRef Name = F->getName(); @@ -4262,16 +4267,30 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = upgradeARMIntrinsicCall(Name, CI, F, Builder); } else if (IsAMDGCN) { Rep = upgradeAMDGCNIntrinsicCall(Name, CI, F, Builder); - } else if (IsDbg && CI->getModule()->IsNewDbgInfoFormat) { - upgradeDbgIntrinsicToDbgRecord(Name, CI); + } else if (IsDbg) { + // We might have decided we don't want the new format after all between + // first requesting the upgrade and now; skip the conversion if that is + // the case, and check here to see if the intrinsic needs to be upgraded + // normally. 
+ if (!CI->getModule()->IsNewDbgInfoFormat) { + bool NeedsUpgrade = + upgradeIntrinsicFunction1(CI->getCalledFunction(), NewFn, false); + if (!NeedsUpgrade) + return; + FallthroughToDefaultUpgrade = true; + } else { + upgradeDbgIntrinsicToDbgRecord(Name, CI); + } } else { llvm_unreachable("Unknown function for CallBase upgrade."); } - if (Rep) - CI->replaceAllUsesWith(Rep); - CI->eraseFromParent(); - return; + if (!FallthroughToDefaultUpgrade) { + if (Rep) + CI->replaceAllUsesWith(Rep); + CI->eraseFromParent(); + return; + } } const auto &DefaultCase = [&]() -> void { -- cgit v1.2.3 From 918542d2ce550c0ac3c7e4d753318265f277631b Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 4 Apr 2024 10:55:24 +0100 Subject: ELFRelocs/AArch64: update canonical reference URL (NFC) (#86955) Update the URL of the reference to be used for AArch64.def, and add some comments. The canonical aaelf64 document can be found at: https://github.com/ARM-software/abi-aa/blob/main/aaelf64/aaelf64.rst --- .../llvm/BinaryFormat/ELFRelocs/AArch64.def | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def index 5fb3fa4aeb7b..cb05db85e2b5 100644 --- a/llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def +++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/AArch64.def @@ -1,18 +1,19 @@ - #ifndef ELF_RELOC #error "ELF_RELOC must be defined" #endif -// Based on ABI release 1.1-beta, dated 6 November 2013. NB: The cover page of -// this document, IHI0056C_beta_aaelf64.pdf, on infocenter.arm.com, still -// labels this as release 1.0. +// Based on released ABI: https://github.com/ARM-software/abi-aa, aaelf64. 
+// ELF64 +// Null relocation: also 0x100 for ELF64 ELF_RELOC(R_AARCH64_NONE, 0) +// Data relocations ELF_RELOC(R_AARCH64_ABS64, 0x101) ELF_RELOC(R_AARCH64_ABS32, 0x102) ELF_RELOC(R_AARCH64_ABS16, 0x103) ELF_RELOC(R_AARCH64_PREL64, 0x104) ELF_RELOC(R_AARCH64_PREL32, 0x105) ELF_RELOC(R_AARCH64_PREL16, 0x106) +// Static AArch64 relocations ELF_RELOC(R_AARCH64_MOVW_UABS_G0, 0x107) ELF_RELOC(R_AARCH64_MOVW_UABS_G0_NC, 0x108) ELF_RELOC(R_AARCH64_MOVW_UABS_G1, 0x109) @@ -60,11 +61,13 @@ ELF_RELOC(R_AARCH64_LD64_GOT_LO12_NC, 0x138) ELF_RELOC(R_AARCH64_LD64_GOTPAGE_LO15, 0x139) ELF_RELOC(R_AARCH64_PLT32, 0x13a) ELF_RELOC(R_AARCH64_GOTPCREL32, 0x13b) +// General dynamic TLS relocations ELF_RELOC(R_AARCH64_TLSGD_ADR_PREL21, 0x200) ELF_RELOC(R_AARCH64_TLSGD_ADR_PAGE21, 0x201) ELF_RELOC(R_AARCH64_TLSGD_ADD_LO12_NC, 0x202) ELF_RELOC(R_AARCH64_TLSGD_MOVW_G1, 0x203) ELF_RELOC(R_AARCH64_TLSGD_MOVW_G0_NC, 0x204) +// Local dynamic TLS relocations ELF_RELOC(R_AARCH64_TLSLD_ADR_PREL21, 0x205) ELF_RELOC(R_AARCH64_TLSLD_ADR_PAGE21, 0x206) ELF_RELOC(R_AARCH64_TLSLD_ADD_LO12_NC, 0x207) @@ -92,6 +95,7 @@ ELF_RELOC(R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC, 0x21c) ELF_RELOC(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21, 0x21d) ELF_RELOC(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC, 0x21e) ELF_RELOC(R_AARCH64_TLSIE_LD_GOTTPREL_PREL19, 0x21f) +// Local exec TLS relocations ELF_RELOC(R_AARCH64_TLSLE_MOVW_TPREL_G2, 0x220) ELF_RELOC(R_AARCH64_TLSLE_MOVW_TPREL_G1, 0x221) ELF_RELOC(R_AARCH64_TLSLE_MOVW_TPREL_G1_NC, 0x222) @@ -108,6 +112,7 @@ ELF_RELOC(R_AARCH64_TLSLE_LDST32_TPREL_LO12, 0x22c) ELF_RELOC(R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC, 0x22d) ELF_RELOC(R_AARCH64_TLSLE_LDST64_TPREL_LO12, 0x22e) ELF_RELOC(R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC, 0x22f) +// TLS descriptor relocations ELF_RELOC(R_AARCH64_TLSDESC_LD_PREL19, 0x230) ELF_RELOC(R_AARCH64_TLSDESC_ADR_PREL21, 0x231) ELF_RELOC(R_AARCH64_TLSDESC_ADR_PAGE21, 0x232) @@ -122,8 +127,7 @@ ELF_RELOC(R_AARCH64_TLSLE_LDST128_TPREL_LO12, 0x23a) 
ELF_RELOC(R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC, 0x23b) ELF_RELOC(R_AARCH64_TLSLD_LDST128_DTPREL_LO12, 0x23c) ELF_RELOC(R_AARCH64_TLSLD_LDST128_DTPREL_LO12_NC, 0x23d) -ELF_RELOC(R_AARCH64_AUTH_ABS64, 0x244) -// Dynamic relocations start +// Dynamic relocations ELF_RELOC(R_AARCH64_COPY, 0x400) ELF_RELOC(R_AARCH64_GLOB_DAT, 0x401) ELF_RELOC(R_AARCH64_JUMP_SLOT, 0x402) @@ -136,8 +140,12 @@ ELF_RELOC(R_AARCH64_TLS_DTPREL64, 0x405) ELF_RELOC(R_AARCH64_TLS_TPREL64, 0x406) ELF_RELOC(R_AARCH64_TLSDESC, 0x407) ELF_RELOC(R_AARCH64_IRELATIVE, 0x408) +// PAuthABI static and dynamic relocations: defined in pauthabielf64, +// https://github.com/ARM-software/abi-aa +ELF_RELOC(R_AARCH64_AUTH_ABS64, 0x244) ELF_RELOC(R_AARCH64_AUTH_RELATIVE, 0x411) +// ELF32 // ELF_RELOC(R_AARCH64_P32_NONE, 0) ELF_RELOC(R_AARCH64_P32_ABS32, 0x001) ELF_RELOC(R_AARCH64_P32_ABS16, 0x002) @@ -216,7 +224,7 @@ ELF_RELOC(R_AARCH64_P32_TLSDESC_ADR_PAGE21, 0x07c) ELF_RELOC(R_AARCH64_P32_TLSDESC_LD32_LO12, 0x07d) ELF_RELOC(R_AARCH64_P32_TLSDESC_ADD_LO12, 0x07e) ELF_RELOC(R_AARCH64_P32_TLSDESC_CALL, 0x07f) -// Dynamic relocations start +// Dynamic relocations ELF_RELOC(R_AARCH64_P32_COPY, 0x0b4) ELF_RELOC(R_AARCH64_P32_GLOB_DAT, 0x0b5) ELF_RELOC(R_AARCH64_P32_JUMP_SLOT, 0x0b6) -- cgit v1.2.3 From 099ecdf1ec2f87b5bae74518166daf1d2b09da45 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Thu, 4 Apr 2024 10:55:42 +0100 Subject: [mlir][OpenMP] map argument to reduction initialization region (#86979) The argument to the initialization region of reduction declarations was never mapped. This meant that if this argument was accessed inside the initialization region, that mlir operation would be translated to an llvm operation with a null argument (failing verification). Adding the mapping ensures that the right LLVM value can be found when inlining and converting the initialization region. 
We have to separately establish and clean up these mappings for each use of the reduction declaration because repeated usage of the same declaration will inline it using a different concrete value for the block argument. This argument was never used previously because for most cases the initialized value depends only upon the type of the reduction, not on the original variable. It is needed now so that we can read the array extents for the local copy from the mold. Flang support for reductions on assumed shape arrays patch 2/3 --- .../Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 35 +++++++ .../Target/LLVMIR/openmp-reduction-init-arg.mlir | 111 +++++++++++++++++++++ 2 files changed, 146 insertions(+) create mode 100644 mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index cacf2c37e38d..c4bf6a20ebe7 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -825,6 +825,25 @@ static void allocByValReductionVars( } } +/// Map input argument to all reduction initialization regions +template +static void +mapInitializationArg(T loop, LLVM::ModuleTranslation &moduleTranslation, + SmallVectorImpl &reductionDecls, + unsigned i) { + // map input argument to the initialization region + mlir::omp::DeclareReductionOp &reduction = reductionDecls[i]; + Region &initializerRegion = reduction.getInitializerRegion(); + Block &entry = initializerRegion.front(); + assert(entry.getNumArguments() == 1 && + "the initialization region has one argument"); + + mlir::Value mlirSource = loop.getReductionVars()[i]; + llvm::Value *llvmSource = moduleTranslation.lookupValue(mlirSource); + assert(llvmSource && "lookup reduction var"); + moduleTranslation.mapValue(entry.getArgument(0), llvmSource); +} + /// Collect reduction info template 
static void collectReductionInfo( @@ -902,6 +921,10 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, loop.getRegion().getArguments().take_back(loop.getNumReductionVars()); for (unsigned i = 0; i < loop.getNumReductionVars(); ++i) { SmallVector phis; + + // map block argument to initializer region + mapInitializationArg(loop, moduleTranslation, reductionDecls, i); + if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(), "omp.reduction.neutral", builder, moduleTranslation, &phis))) @@ -925,6 +948,11 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, builder.CreateStore(phis[0], privateReductionVariables[i]); // the rest was handled in allocByValReductionVars } + + // forget the mapping for the initializer region because we might need a + // different mapping if this reduction declaration is re-used for a + // different variable + moduleTranslation.forgetMapping(reductionDecls[i].getInitializerRegion()); } // Store the mapping between reduction variables and their private copies on @@ -1118,6 +1146,9 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, opInst.getNumReductionVars()); for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) { SmallVector phis; + + // map the block argument + mapInitializationArg(opInst, moduleTranslation, reductionDecls, i); if (failed(inlineConvertOmpRegions( reductionDecls[i].getInitializerRegion(), "omp.reduction.neutral", builder, moduleTranslation, &phis))) @@ -1144,6 +1175,10 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, builder.CreateStore(phis[0], privateReductionVariables[i]); // the rest is done in allocByValReductionVars } + + // clear block argument mapping in case it needs to be re-created with a + // different source for another use of the same reduction decl + moduleTranslation.forgetMapping(reductionDecls[i].getInitializerRegion()); } // Store the mapping between reduction variables and their private copies 
on diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir new file mode 100644 index 000000000000..5dd31c425566 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-reduction-init-arg.mlir @@ -0,0 +1,111 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// Test that the block argument to the initialization region of +// omp.declare_reduction gets mapped properly when translating to LLVMIR. + +module { + omp.declare_reduction @add_reduction_byref_box_Uxf64 : !llvm.ptr init { + ^bb0(%arg0: !llvm.ptr): +// test usage of %arg0: + %11 = llvm.load %arg0 : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + omp.yield(%arg0 : !llvm.ptr) + } combiner { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + omp.yield(%arg0 : !llvm.ptr) + } + + llvm.func internal @_QFPreduce(%arg0: !llvm.ptr {fir.bindc_name = "r"}, %arg1: !llvm.ptr {fir.bindc_name = "r2"}) attributes {sym_visibility = "private"} { + %8 = llvm.mlir.constant(1 : i32) : i32 + %9 = llvm.mlir.constant(10 : i32) : i32 + %10 = llvm.mlir.constant(0 : i32) : i32 + %83 = llvm.mlir.constant(1 : i64) : i64 + %84 = llvm.alloca %83 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr + %86 = llvm.mlir.constant(1 : i64) : i64 + %87 = llvm.alloca %86 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr +// test multiple reduction variables to ensure they don't intefere with eachother +// when inlining the reduction init region multiple times + omp.parallel byref reduction(@add_reduction_byref_box_Uxf64 %84 -> %arg3 : !llvm.ptr, @add_reduction_byref_box_Uxf64 %87 -> %arg4 : !llvm.ptr) { + omp.terminator + } + llvm.return + } +} + +// CHECK-LABEL: define internal void @_QFPreduce +// CHECK: %[[VAL_0:.*]] = alloca { ptr, ptr }, align 8 +// CHECK: %[[VAL_1:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 
+// CHECK: %[[VAL_2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 +// CHECK: br label %[[VAL_3:.*]] +// CHECK: entry: ; preds = %[[VAL_4:.*]] +// CHECK: %[[VAL_5:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: br label %[[VAL_6:.*]] +// CHECK: omp_parallel: ; preds = %[[VAL_3]] +// CHECK: %[[VAL_7:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_0]], i32 0, i32 0 +// CHECK: store ptr %[[VAL_1]], ptr %[[VAL_7]], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_0]], i32 0, i32 1 +// CHECK: store ptr %[[VAL_2]], ptr %[[VAL_8]], align 8 +// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @1, i32 1, ptr @_QFPreduce..omp_par, ptr %[[VAL_0]]) +// CHECK: br label %[[VAL_9:.*]] +// CHECK: omp.par.outlined.exit: ; preds = %[[VAL_6]] +// CHECK: br label %[[VAL_10:.*]] +// CHECK: omp.par.exit.split: ; preds = %[[VAL_9]] +// CHECK: ret void +// CHECK: omp.par.entry: +// CHECK: %[[VAL_11:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_12:.*]], i32 0, i32 0 +// CHECK: %[[VAL_13:.*]] = load ptr, ptr %[[VAL_11]], align 8 +// CHECK: %[[VAL_14:.*]] = getelementptr { ptr, ptr }, ptr %[[VAL_12]], i32 0, i32 1 +// CHECK: %[[VAL_15:.*]] = load ptr, ptr %[[VAL_14]], align 8 +// CHECK: %[[VAL_16:.*]] = alloca i32, align 4 +// CHECK: %[[VAL_17:.*]] = load i32, ptr %[[VAL_18:.*]], align 4 +// CHECK: store i32 %[[VAL_17]], ptr %[[VAL_16]], align 4 +// CHECK: %[[VAL_19:.*]] = load i32, ptr %[[VAL_16]], align 4 +// CHECK: %[[VAL_20:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[VAL_13]], align 8 +// CHECK: %[[VAL_21:.*]] = alloca ptr, align 8 +// CHECK: store ptr %[[VAL_13]], ptr %[[VAL_21]], align 8 +// CHECK: %[[VAL_22:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[VAL_15]], align 8 +// CHECK: %[[VAL_23:.*]] = alloca ptr, align 8 +// CHECK: store ptr %[[VAL_15]], ptr %[[VAL_23]], align 8 +// CHECK: %[[VAL_24:.*]] = alloca [2 x ptr], align 8 +// CHECK: br label 
%[[VAL_25:.*]] +// CHECK: omp.par.region: ; preds = %[[VAL_26:.*]] +// CHECK: br label %[[VAL_27:.*]] +// CHECK: omp.par.region1: ; preds = %[[VAL_25]] +// CHECK: br label %[[VAL_28:.*]] +// CHECK: omp.region.cont: ; preds = %[[VAL_27]] +// CHECK: %[[VAL_29:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_24]], i64 0, i64 0 +// CHECK: store ptr %[[VAL_21]], ptr %[[VAL_29]], align 8 +// CHECK: %[[VAL_30:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_24]], i64 0, i64 1 +// CHECK: store ptr %[[VAL_23]], ptr %[[VAL_30]], align 8 +// CHECK: %[[VAL_31:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[VAL_32:.*]] = call i32 @__kmpc_reduce(ptr @1, i32 %[[VAL_31]], i32 2, i64 16, ptr %[[VAL_24]], ptr @.omp.reduction.func, ptr @.gomp_critical_user_.reduction.var) +// CHECK: switch i32 %[[VAL_32]], label %[[VAL_33:.*]] [ +// CHECK: i32 1, label %[[VAL_34:.*]] +// CHECK: i32 2, label %[[VAL_35:.*]] +// CHECK: ] +// CHECK: reduce.switch.atomic: ; preds = %[[VAL_28]] +// CHECK: unreachable +// CHECK: reduce.switch.nonatomic: ; preds = %[[VAL_28]] +// CHECK: %[[VAL_36:.*]] = load ptr, ptr %[[VAL_21]], align 8 +// CHECK: %[[VAL_37:.*]] = load ptr, ptr %[[VAL_23]], align 8 +// CHECK: call void @__kmpc_end_reduce(ptr @1, i32 %[[VAL_31]], ptr @.gomp_critical_user_.reduction.var) +// CHECK: br label %[[VAL_33]] +// CHECK: reduce.finalize: ; preds = %[[VAL_34]], %[[VAL_28]] +// CHECK: br label %[[VAL_38:.*]] +// CHECK: omp.par.pre_finalize: ; preds = %[[VAL_33]] +// CHECK: br label %[[VAL_39:.*]] +// CHECK: omp.par.outlined.exit.exitStub: ; preds = %[[VAL_38]] +// CHECK: ret void +// CHECK: %[[VAL_40:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_41:.*]], i64 0, i64 0 +// CHECK: %[[VAL_42:.*]] = load ptr, ptr %[[VAL_40]], align 8 +// CHECK: %[[VAL_43:.*]] = load ptr, ptr %[[VAL_42]], align 8 +// CHECK: %[[VAL_44:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_45:.*]], i64 0, i64 0 +// CHECK: %[[VAL_46:.*]] = load ptr, ptr %[[VAL_44]], align 8 +// CHECK: 
%[[VAL_47:.*]] = load ptr, ptr %[[VAL_46]], align 8 +// CHECK: %[[VAL_48:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_41]], i64 0, i64 1 +// CHECK: %[[VAL_49:.*]] = load ptr, ptr %[[VAL_48]], align 8 +// CHECK: %[[VAL_50:.*]] = load ptr, ptr %[[VAL_49]], align 8 +// CHECK: %[[VAL_51:.*]] = getelementptr inbounds [2 x ptr], ptr %[[VAL_45]], i64 0, i64 1 +// CHECK: %[[VAL_52:.*]] = load ptr, ptr %[[VAL_51]], align 8 +// CHECK: %[[VAL_53:.*]] = load ptr, ptr %[[VAL_52]], align 8 +// CHECK: ret void + -- cgit v1.2.3 From a9d963fdf81dc5c6221a0e915821877c35096aff Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Apr 2024 11:01:52 +0100 Subject: [DAG] SoftenFloatResult - add clang-format off/on tags around switch statement. NFC. Stop clang-format from trying to put all the case on separate lines. --- llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 3332c02ec723..a8b1f41ee40d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -53,6 +53,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { SDValue R = SDValue(); switch (N->getOpcode()) { + // clang-format off default: #ifndef NDEBUG dbgs() << "SoftenFloatResult #" << ResNo << ": "; @@ -115,9 +116,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FPOWI: case ISD::FLDEXP: case ISD::STRICT_FLDEXP: R = SoftenFloatRes_ExpOp(N); break; - case ISD::FFREXP: - R = SoftenFloatRes_FFREXP(N); - break; + case ISD::FFREXP: R = SoftenFloatRes_FFREXP(N); break; case ISD::STRICT_FREM: case ISD::FREM: R = SoftenFloatRes_FREM(N); break; case ISD::STRICT_FRINT: @@ -150,14 +149,11 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::VECREDUCE_FMIN: case 
ISD::VECREDUCE_FMAX: case ISD::VECREDUCE_FMAXIMUM: - case ISD::VECREDUCE_FMINIMUM: - R = SoftenFloatRes_VECREDUCE(N); - break; + case ISD::VECREDUCE_FMINIMUM: R = SoftenFloatRes_VECREDUCE(N); break; case ISD::VECREDUCE_SEQ_FADD: - case ISD::VECREDUCE_SEQ_FMUL: - R = SoftenFloatRes_VECREDUCE_SEQ(N); - break; - } + case ISD::VECREDUCE_SEQ_FMUL: R = SoftenFloatRes_VECREDUCE_SEQ(N); break; + // clang-format on + } // If R is null, the sub-method took care of registering the result. if (R.getNode()) { -- cgit v1.2.3 From d799be8154d6cb9f5a5b0f6a5c41923313731a85 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Thu, 4 Apr 2024 12:06:28 +0200 Subject: [flang] implement passing assumed-size array to assumed-rank arguments (#87511) Remove the TODO, the patch that ensured that the descriptor upper bound is set to -1 was https://github.com/llvm/llvm-project/pull/79156. --- flang/lib/Lower/ConvertCall.cpp | 9 --------- flang/test/Lower/HLFIR/assumed-rank-iface.f90 | 23 +++++++++++++++++------ 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp index 6eba243c237c..315a3f6736aa 100644 --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -1340,15 +1340,6 @@ static PreparedDummyArgument preparePresentUserCallActualArgument( } else { addr = hlfir::genVariableRawAddress(loc, builder, entity); } - // The last extent created for assumed-rank descriptors must be -1 (18.5.3 - // point 5.). This should be done when creating the assumed-size shape for - // consistency. 
- if (auto baseBoxDummy = mlir::dyn_cast(dummyType)) - if (baseBoxDummy.isAssumedRank()) - if (const Fortran::semantics::Symbol *sym = - Fortran::evaluate::UnwrapWholeSymbolDataRef(*arg.entity)) - if (Fortran::semantics::IsAssumedSizeArray(sym->GetUltimate())) - TODO(loc, "passing assumed-size to assumed-rank array"); // For ranked actual passed to assumed-rank dummy, the cast to assumed-rank // box is inserted when building the fir.call op. Inserting it here would diff --git a/flang/test/Lower/HLFIR/assumed-rank-iface.f90 b/flang/test/Lower/HLFIR/assumed-rank-iface.f90 index 5df794434844..155ce8fb55f2 100644 --- a/flang/test/Lower/HLFIR/assumed-rank-iface.f90 +++ b/flang/test/Lower/HLFIR/assumed-rank-iface.f90 @@ -133,9 +133,20 @@ end subroutine ! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_7]] : (!fir.box>) -> !fir.box> ! CHECK: fir.call @_QPint_opt_assumed_rank(%[[VAL_11]]) fastmath : (!fir.box>) -> () -! TODO: set assumed size last extent to -1. -!subroutine int_r2_assumed_size_to_assumed_rank(x) -! use ifaces, only : int_assumed_rank -! integer :: x(10, *) -! call int_assumed_rank(x) -!end subroutine +subroutine int_r2_assumed_size_to_assumed_rank(x) + use ifaces, only : int_assumed_rank + integer :: x(10, *) + call int_assumed_rank(x) +end subroutine +! CHECK-LABEL: func.func @_QPint_r2_assumed_size_to_assumed_rank( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref> {fir.bindc_name = "x"}) { +! CHECK: %[[VAL_1:.*]] = arith.constant 10 : i64 +! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i64) -> index +! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_4:.*]] = arith.cmpi sgt, %[[VAL_2]], %[[VAL_3]] : index +! CHECK: %[[VAL_5:.*]] = arith.select %[[VAL_4]], %[[VAL_2]], %[[VAL_3]] : index +! CHECK: %[[VAL_6:.*]] = arith.constant -1 : index +! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_5]], %[[VAL_6]] : (index, index) -> !fir.shape<2> +! 
CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) {uniq_name = "_QFint_r2_assumed_size_to_assumed_rankEx"} : (!fir.ref>, !fir.shape<2>) -> (!fir.box>, !fir.ref>) +! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]]#0 : (!fir.box>) -> !fir.box> +! CHECK: fir.call @_QPint_assumed_rank(%[[VAL_9]]) fastmath : (!fir.box>) -> () -- cgit v1.2.3 From cbdc86e46c3824dda152db2bd0b9fdb3872ddf87 Mon Sep 17 00:00:00 2001 From: "Andrew V. Teylu" Date: Thu, 4 Apr 2024 11:10:23 +0100 Subject: [clang-repl] Add call to 'InitializeAllAsmParsers' (#86727) This PR fixes the following issue when working with `clang-repl`: ``` fatal error: error in backend: Inline asm not supported by this streamer because we don't have an asm parser for this target ``` When working with the following input (named "unit.cpp"): ```cpp __asm(".globl _ZSt21ios_base_library_initv"); int x; ``` and then in `clang-repl`: ``` #include "unit.cpp" x = 10; ``` Signed-off-by: Andrew V. Teylu --- clang/test/Interpreter/inline-asm.cpp | 17 +++++++++++++++++ clang/tools/clang-repl/ClangRepl.cpp | 1 + 2 files changed, 18 insertions(+) create mode 100644 clang/test/Interpreter/inline-asm.cpp diff --git a/clang/test/Interpreter/inline-asm.cpp b/clang/test/Interpreter/inline-asm.cpp new file mode 100644 index 000000000000..f94f14df72f8 --- /dev/null +++ b/clang/test/Interpreter/inline-asm.cpp @@ -0,0 +1,17 @@ +// REQUIRES: host-supports-jit, x86_64-linux +// UNSUPPORTED: system-aix +// +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: cat %t/inline-asm.txt | clang-repl -Xcc="-I%t" + +//--- inline-asm.cpp +__asm(".globl _ZSt21ios_base_library_initv"); +int x; + +//--- inline-asm.txt +#include "inline-asm.cpp" +x = 10; +%quit diff --git a/clang/tools/clang-repl/ClangRepl.cpp b/clang/tools/clang-repl/ClangRepl.cpp index 5bad8145324d..aecf61b97fc7 100644 --- a/clang/tools/clang-repl/ClangRepl.cpp +++ b/clang/tools/clang-repl/ClangRepl.cpp @@ -152,6 +152,7 @@ int main(int argc, const char 
**argv) { llvm::InitializeAllTargets(); llvm::InitializeAllTargetMCs(); llvm::InitializeAllAsmPrinters(); + llvm::InitializeAllAsmParsers(); if (OptHostSupportsJit) { auto J = llvm::orc::LLJITBuilder().create(); -- cgit v1.2.3 From 2d0087424f120c97c8fdecf50e6e5f6a1f3969f2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Apr 2024 11:10:55 +0100 Subject: [DAG] Remove extract_vector_elt(freeze(x)), idx -> freeze(extract_vector_elt(x), idx) fold (#87480) Reverse the fold with handling inside canCreateUndefOrPoison for cases where we know that the extract index is in bounds. This exposed a number or regressions, and required some initial freeze handling of SCALAR_TO_VECTOR, which will require us to properly improve demandedelts support to handle its undef upper elements. There is still one outstanding regression to be addressed in the future - how do we want to handle folds involving frozen loads? Fixes #86968 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 ------ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 11 +++++++++++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 21 +++++++++++++++++++++ llvm/lib/Target/X86/X86ISelLowering.cpp | 2 ++ llvm/test/CodeGen/X86/freeze-vector.ll | 6 ++---- ...den-load-of-small-alloca-with-zero-upper-half.ll | 18 +++++++++++++----- 6 files changed, 49 insertions(+), 15 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 28fe0697357d..0a473180538a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -22265,12 +22265,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { IndexC->getAPIntValue().uge(VecVT.getVectorNumElements())) return DAG.getUNDEF(ScalarVT); - // extract_vector_elt(freeze(x)), idx -> freeze(extract_vector_elt(x)), idx - if (VecOp.hasOneUse() && VecOp.getOpcode() == ISD::FREEZE) { - return DAG.getFreeze(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, - 
VecOp.getOperand(0), Index)); - } - // extract_vector_elt (build_vector x, y), 1 -> y if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) || VecOp.getOpcode() == ISD::SPLAT_VECTOR) && diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 25b51d5c1e02..1dd0fa49a460 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5149,6 +5149,17 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::OR: return ConsiderFlags && Op->getFlags().hasDisjoint(); + case ISD::SCALAR_TO_VECTOR: + // Check if we demand any upper (undef) elements. + return !PoisonOnly && DemandedElts.ugt(1); + + case ISD::EXTRACT_VECTOR_ELT: { + // Ensure that the element index is in bounds. + EVT VecVT = Op.getOperand(0).getValueType(); + KnownBits KnownIdx = computeKnownBits(Op.getOperand(1), Depth + 1); + return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements()); + } + case ISD::INSERT_VECTOR_ELT:{ // Ensure that the element index is in bounds. 
EVT VecVT = Op.getOperand(0).getValueType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 8bb9541bfe10..5e053f97675d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -742,6 +742,13 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( break; } + case ISD::FREEZE: { + SDValue N0 = Op.getOperand(0); + if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts, + /*PoisonOnly=*/false)) + return N0; + break; + } case ISD::AND: { LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); @@ -3184,6 +3191,20 @@ bool TargetLowering::SimplifyDemandedVectorElts( } break; } + case ISD::FREEZE: { + SDValue N0 = Op.getOperand(0); + if (TLO.DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts, + /*PoisonOnly=*/false)) + return TLO.CombineTo(Op, N0); + + // TODO: Replace this with the general fold from DAGCombiner::visitFREEZE + // freeze(op(x, ...)) -> op(freeze(x), ...). + if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR && DemandedElts == 1) + return TLO.CombineTo( + Op, TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, + TLO.DAG.getFreeze(N0.getOperand(0)))); + break; + } case ISD::BUILD_VECTOR: { // Check all elements and simplify any unused elements with UNDEF. 
if (!DemandedElts.isAllOnes()) { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a9751e170fe6..6f65344215c0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -42725,6 +42725,8 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( switch (Op.getOpcode()) { case X86ISD::PSHUFD: case X86ISD::VPERMILPI: + case X86ISD::UNPCKH: + case X86ISD::UNPCKL: return false; } return TargetLowering::canCreateUndefOrPoisonForTargetNode( diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll index d9ee5f0d3e49..ee7f4aea02c0 100644 --- a/llvm/test/CodeGen/X86/freeze-vector.ll +++ b/llvm/test/CodeGen/X86/freeze-vector.ll @@ -173,16 +173,14 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: vmovdqa (%edx), %xmm0 ; X86-NEXT: vpand (%ecx), %xmm0, %xmm0 -; X86-NEXT: vpextrb $6, %xmm0, %ecx -; X86-NEXT: movb %cl, (%eax) +; X86-NEXT: vpextrb $6, %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: freeze_extractelement: ; X64: # %bb.0: ; X64-NEXT: vmovdqa (%rdi), %xmm0 ; X64-NEXT: vpand (%rsi), %xmm0, %xmm0 -; X64-NEXT: vpextrb $6, %xmm0, %eax -; X64-NEXT: movb %al, (%rdx) +; X64-NEXT: vpextrb $6, %xmm0, (%rdx) ; X64-NEXT: retq %i0 = load <16 x i8>, ptr %origin0 %i1 = load <16 x i8>, ptr %origin1 diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll index 691ca40191d4..f7a27a5b9144 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -65,6 +65,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X64-NO-BMI2: # %bb.0: ; X64-NO-BMI2-NEXT: 
movzwl (%rdi), %eax +; X64-NO-BMI2-NEXT: movzwl %ax, %eax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrl %cl, %eax @@ -74,6 +75,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X64-BMI2: # %bb.0: ; X64-BMI2-NEXT: movzwl (%rdi), %eax +; X64-BMI2-NEXT: movzwl %ax, %eax ; X64-BMI2-NEXT: shll $3, %esi ; X64-BMI2-NEXT: shrxl %esi, %eax, %eax ; X64-BMI2-NEXT: movb %al, (%rdx) @@ -81,14 +83,15 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X86-NO-BMI2: # %bb.0: -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movzwl (%eax), %eax +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NEXT: movzwl (%edx), %edx +; X86-NO-BMI2-NEXT: movzwl %dx, %edx ; X86-NO-BMI2-NEXT: shll $3, %ecx ; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NEXT: movb %al, (%edx) +; X86-NO-BMI2-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NEXT: movb %dl, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half: @@ -97,6 +100,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movzwl (%edx), %edx +; X86-BMI2-NEXT: movzwl %dx, %edx ; X86-BMI2-NEXT: shll $3, %ecx ; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx ; X86-BMI2-NEXT: movb %cl, (%eax) @@ -119,6 +123,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; X64-NO-BMI2-LABEL: 
load_2byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X64-NO-BMI2: # %bb.0: ; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax +; X64-NO-BMI2-NEXT: movzwl %ax, %eax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrl %cl, %eax @@ -128,6 +133,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X64-BMI2: # %bb.0: ; X64-BMI2-NEXT: movzwl (%rdi), %eax +; X64-BMI2-NEXT: movzwl %ax, %eax ; X64-BMI2-NEXT: shll $3, %esi ; X64-BMI2-NEXT: shrxl %esi, %eax, %eax ; X64-BMI2-NEXT: movw %ax, (%rdx) @@ -139,6 +145,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NEXT: movzwl (%edx), %edx +; X86-NO-BMI2-NEXT: movzwl %dx, %edx ; X86-NO-BMI2-NEXT: shll $3, %ecx ; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NEXT: shrl %cl, %edx @@ -151,6 +158,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movzwl (%edx), %edx +; X86-BMI2-NEXT: movzwl %dx, %edx ; X86-BMI2-NEXT: shll $3, %ecx ; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx ; X86-BMI2-NEXT: movw %cx, (%eax) -- cgit v1.2.3 From dbd6eb6779bc7073c5466e9689eb5a69736d1120 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Thu, 4 Apr 2024 11:14:28 +0100 Subject: [flang][OpenMP] lower reductions of assumed shape arrays (#86982) Patch 1: https://github.com/llvm/llvm-project/pull/86978 Patch 2: https://github.com/llvm/llvm-project/pull/86979 --- flang/lib/Lower/OpenMP/ReductionProcessor.cpp | 25 +++++- .../wsloop-reduction-array-assumed-shape.f90 | 90 ++++++++++++++++++++++ 2 files changed, 112 insertions(+), 3 deletions(-) create mode 100644 
flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp index 6a8447a00e4c..c1c94119fd90 100644 --- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp @@ -523,11 +523,16 @@ void ReductionProcessor::addDeclareReduction( if (reductionSymbols) reductionSymbols->push_back(symbol); mlir::Value symVal = converter.getSymbolAddress(*symbol); - auto redType = mlir::cast(symVal.getType()); + mlir::Type eleType; + auto refType = mlir::dyn_cast_or_null(symVal.getType()); + if (refType) + eleType = refType.getEleTy(); + else + eleType = symVal.getType(); // all arrays must be boxed so that we have convenient access to all the // information needed to iterate over the array - if (mlir::isa(redType.getEleTy())) { + if (mlir::isa(eleType)) { // For Host associated symbols, use `SymbolBox` instead Fortran::lower::SymbolBox symBox = converter.lookupOneLevelUpSymbol(*symbol); @@ -542,11 +547,25 @@ void ReductionProcessor::addDeclareReduction( builder.create(currentLocation, box, alloca); symVal = alloca; - redType = mlir::cast(symVal.getType()); + } else if (mlir::isa(symVal.getType())) { + // boxed arrays are passed as values not by reference. Unfortunately, + // we can't pass a box by value to omp.redution_declare, so turn it + // into a reference + + auto alloca = + builder.create(currentLocation, symVal.getType()); + builder.create(currentLocation, symVal, alloca); + symVal = alloca; } else if (auto declOp = symVal.getDefiningOp()) { symVal = declOp.getBase(); } + // this isn't the same as the by-val and by-ref passing later in the + // pipeline. 
Both styles assume that the variable is a reference at + // this point + assert(mlir::isa(symVal.getType()) && + "reduction input var is a reference"); + reductionVars.push_back(symVal); } const bool isByRef = doReductionByRef(reductionVars); diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 new file mode 100644 index 000000000000..a1f339faea5c --- /dev/null +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 @@ -0,0 +1,90 @@ +! RUN: bbc -emit-hlfir -fopenmp -o - %s | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s | FileCheck %s + +program reduce_assumed_shape +real(8), dimension(2) :: r +r = 0 +call reduce(r) +print *, r + +contains +subroutine reduce(r) + implicit none + real(8),intent(inout) :: r(:) + integer :: i = 0 + + !$omp parallel do reduction(+:r) + do i=0,10 + r(1) = i + r(2) = 1 + enddo + !$omp end parallel do +end subroutine +end program + +! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf64 : !fir.ref>> init { +! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>): +! CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64 +! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> +! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[VAL_6:.*]] = fir.alloca !fir.array, %[[VAL_4]]#1 {bindc_name = ".tmp"} +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]](%[[VAL_5]]) {uniq_name = ".tmp"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) +! CHECK: hlfir.assign %[[VAL_1]] to %[[VAL_7]]#0 : f64, !fir.box> +! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box> +! CHECK: fir.store %[[VAL_7]]#0 to %[[VAL_8]] : !fir.ref>> +! CHECK: omp.yield(%[[VAL_8]] : !fir.ref>>) + +! CHECK-LABEL: } combiner { +! 
CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref>>, %[[VAL_1:.*]]: !fir.ref>>): +! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref>> +! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref>> +! CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_4]] : (!fir.box>, index) -> (index, index, index) +! CHECK: %[[VAL_6:.*]] = fir.shape_shift %[[VAL_5]]#0, %[[VAL_5]]#1 : (index, index) -> !fir.shapeshift<1> +! CHECK: %[[VAL_7:.*]] = arith.constant 1 : index +! CHECK: fir.do_loop %[[VAL_8:.*]] = %[[VAL_7]] to %[[VAL_5]]#1 step %[[VAL_7]] unordered { +! CHECK: %[[VAL_9:.*]] = fir.array_coor %[[VAL_2]](%[[VAL_6]]) %[[VAL_8]] : (!fir.box>, !fir.shapeshift<1>, index) -> !fir.ref +! CHECK: %[[VAL_10:.*]] = fir.array_coor %[[VAL_3]](%[[VAL_6]]) %[[VAL_8]] : (!fir.box>, !fir.shapeshift<1>, index) -> !fir.ref +! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref +! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_10]] : !fir.ref +! CHECK: %[[VAL_13:.*]] = arith.addf %[[VAL_11]], %[[VAL_12]] fastmath : f64 +! CHECK: fir.store %[[VAL_13]] to %[[VAL_9]] : !fir.ref +! CHECK: } +! CHECK: omp.yield(%[[VAL_0]] : !fir.ref>>) +! CHECK: } + +! CHECK-LABEL: func.func private @_QFPreduce( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "r"}) attributes {{.*}} { +! CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFFreduceEi) : !fir.ref +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFFreduceEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = {{.*}}, uniq_name = "_QFFreduceEr"} : (!fir.box>) -> (!fir.box>, !fir.box>) +! CHECK: omp.parallel { +! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {adapt.valuebyref, pinned} +! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFFreduceEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 +! CHECK: %[[VAL_7:.*]] = arith.constant 10 : i32 +! 
CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_9:.*]] = fir.alloca !fir.box> +! CHECK: fir.store %[[VAL_3]]#1 to %[[VAL_9]] : !fir.ref>> +! CHECK: omp.wsloop byref reduction(@add_reduction_byref_box_Uxf64 %[[VAL_9]] -> %[[VAL_10:.*]] : !fir.ref>>) for (%[[VAL_11:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) { +! CHECK: fir.store %[[VAL_11]] to %[[VAL_5]]#1 : !fir.ref +! CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {fortran_attrs = {{.*}}, uniq_name = "_QFFreduceEr"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i32) -> f64 +! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref>> +! CHECK: %[[VAL_16:.*]] = arith.constant 1 : index +! CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_15]] (%[[VAL_16]]) : (!fir.box>, index) -> !fir.ref +! CHECK: hlfir.assign %[[VAL_14]] to %[[VAL_17]] : f64, !fir.ref +! CHECK: %[[VAL_18:.*]] = arith.constant 1.000000e+00 : f64 +! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref>> +! CHECK: %[[VAL_20:.*]] = arith.constant 2 : index +! CHECK: %[[VAL_21:.*]] = hlfir.designate %[[VAL_19]] (%[[VAL_20]]) : (!fir.box>, index) -> !fir.ref +! CHECK: hlfir.assign %[[VAL_18]] to %[[VAL_21]] : f64, !fir.ref +! CHECK: omp.yield +! CHECK: } +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! CHECK: } -- cgit v1.2.3 From cc34ad91f0d0cfb4e568e67eedc8a6a3684b89ea Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Thu, 4 Apr 2024 11:19:42 +0100 Subject: [MLIR][OpenMP] Add cleanup region to omp.declare_reduction (#87377) Currently, by-ref reductions will allocate the per-thread reduction variable in the initialization region. Adding a cleanup region allows that allocation to be undone. This will allow flang to support reduction of arrays stored on the heap. 
This conflation of allocation and initialization in the initialization should be fixed in the future to better match the OpenMP standard, but that is beyond the scope of this patch. --- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 14 +++- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 49 ++++++++--- .../Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 53 ++++++++++-- mlir/test/Dialect/OpenMP/invalid.mlir | 19 +++++ mlir/test/Dialect/OpenMP/ops.mlir | 8 ++ .../LLVMIR/openmp-parallel-reduction-cleanup.mlir | 94 ++++++++++++++++++++++ .../LLVMIR/openmp-wsloop-reduction-cleanup.mlir | 86 ++++++++++++++++++++ 7 files changed, 299 insertions(+), 24 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/openmp-parallel-reduction-cleanup.mlir create mode 100644 mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index f33942b3c7c0..457451886e14 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -2135,8 +2135,8 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [Symbol, let summary = "declares a reduction kind"; let description = [{ - Declares an OpenMP reduction kind. This requires two mandatory and one - optional region. + Declares an OpenMP reduction kind. This requires two mandatory and two + optional regions. 1. The initializer region specifies how to initialize the thread-local reduction value. This is usually the neutral element of the reduction. @@ -2149,6 +2149,10 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [Symbol, 3. The atomic reduction region is optional and specifies how two values can be combined atomically given local accumulator variables. It is expected to store the combined value in the first accumulator variable. + 4. The cleanup region is optional and specifies how to clean up any memory + allocated by the initializer region. 
The region has an argument that + contains the value of the thread-local reduction accumulator. This will + be executed after the reduction has completed. Note that the MLIR type system does not allow for type-polymorphic reductions. Separate reduction declarations should be created for different @@ -2163,12 +2167,14 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [Symbol, let regions = (region AnyRegion:$initializerRegion, AnyRegion:$reductionRegion, - AnyRegion:$atomicReductionRegion); + AnyRegion:$atomicReductionRegion, + AnyRegion:$cleanupRegion); let assemblyFormat = "$sym_name `:` $type attr-dict-with-keyword " "`init` $initializerRegion " "`combiner` $reductionRegion " - "custom($atomicReductionRegion)"; + "custom($atomicReductionRegion) " + "custom($cleanupRegion)"; let extraClassDeclaration = [{ PointerLikeType getAccumulatorType() { diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index bf5875071e0d..a04343154a4d 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1538,6 +1538,21 @@ static void printAtomicReductionRegion(OpAsmPrinter &printer, printer.printRegion(region); } +static ParseResult parseCleanupReductionRegion(OpAsmParser &parser, + Region ®ion) { + if (parser.parseOptionalKeyword("cleanup")) + return success(); + return parser.parseRegion(region); +} + +static void printCleanupReductionRegion(OpAsmPrinter &printer, + DeclareReductionOp op, Region ®ion) { + if (region.empty()) + return; + printer << "cleanup "; + printer.printRegion(region); +} + LogicalResult DeclareReductionOp::verifyRegions() { if (getInitializerRegion().empty()) return emitOpError() << "expects non-empty initializer region"; @@ -1571,21 +1586,29 @@ LogicalResult DeclareReductionOp::verifyRegions() { "of the reduction type"; } - if (getAtomicReductionRegion().empty()) + if (!getAtomicReductionRegion().empty()) { + Block &atomicReductionEntryBlock = 
getAtomicReductionRegion().front(); + if (atomicReductionEntryBlock.getNumArguments() != 2 || + atomicReductionEntryBlock.getArgumentTypes()[0] != + atomicReductionEntryBlock.getArgumentTypes()[1]) + return emitOpError() << "expects atomic reduction region with two " + "arguments of the same type"; + auto ptrType = llvm::dyn_cast( + atomicReductionEntryBlock.getArgumentTypes()[0]); + if (!ptrType || + (ptrType.getElementType() && ptrType.getElementType() != getType())) + return emitOpError() << "expects atomic reduction region arguments to " + "be accumulators containing the reduction type"; + } + + if (getCleanupRegion().empty()) return success(); + Block &cleanupEntryBlock = getCleanupRegion().front(); + if (cleanupEntryBlock.getNumArguments() != 1 || + cleanupEntryBlock.getArgument(0).getType() != getType()) + return emitOpError() << "expects cleanup region with one argument " + "of the reduction type"; - Block &atomicReductionEntryBlock = getAtomicReductionRegion().front(); - if (atomicReductionEntryBlock.getNumArguments() != 2 || - atomicReductionEntryBlock.getArgumentTypes()[0] != - atomicReductionEntryBlock.getArgumentTypes()[1]) - return emitOpError() << "expects atomic reduction region with two " - "arguments of the same type"; - auto ptrType = llvm::dyn_cast( - atomicReductionEntryBlock.getArgumentTypes()[0]); - if (!ptrType || - (ptrType.getElementType() && ptrType.getElementType() != getType())) - return emitOpError() << "expects atomic reduction region arguments to " - "be accumulators containing the reduction type"; return success(); } diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index c4bf6a20ebe7..08ec57803aff 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -877,6 +877,32 @@ static void collectReductionInfo( } } +/// handling of 
DeclareReductionOp's cleanup region +static LogicalResult inlineReductionCleanup( + llvm::SmallVectorImpl &reductionDecls, + llvm::ArrayRef privateReductionVariables, + LLVM::ModuleTranslation &moduleTranslation, llvm::IRBuilderBase &builder) { + for (auto [i, reductionDecl] : llvm::enumerate(reductionDecls)) { + Region &cleanupRegion = reductionDecl.getCleanupRegion(); + if (cleanupRegion.empty()) + continue; + + // map the argument to the cleanup region + Block &entry = cleanupRegion.front(); + moduleTranslation.mapValue(entry.getArgument(0), + privateReductionVariables[i]); + + if (failed(inlineConvertOmpRegions(cleanupRegion, "omp.reduction.cleanup", + builder, moduleTranslation))) + return failure(); + + // clear block argument mapping in case it needs to be re-created with a + // different source for another use of the same reduction decl + moduleTranslation.forgetMapping(cleanupRegion); + } + return success(); +} + /// Converts an OpenMP workshare loop into LLVM IR using OpenMPIRBuilder. 
static LogicalResult convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, @@ -1072,7 +1098,9 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, tempTerminator->eraseFromParent(); builder.restoreIP(nextInsertionPoint); - return success(); + // after the workshare loop, deallocate private reduction variables + return inlineReductionCleanup(reductionDecls, privateReductionVariables, + moduleTranslation, builder); } /// A RAII class that on construction replaces the region arguments of the @@ -1125,13 +1153,13 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, LogicalResult bodyGenStatus = success(); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); - auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { - // Collect reduction declarations - SmallVector reductionDecls; - collectReductionDecls(opInst, reductionDecls); + // Collect reduction declarations + SmallVector reductionDecls; + collectReductionDecls(opInst, reductionDecls); + SmallVector privateReductionVariables; + auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { // Allocate reduction vars - SmallVector privateReductionVariables; DenseMap reductionVariableMap; if (!isByRef) { allocByValReductionVars(opInst, builder, moduleTranslation, allocaIP, @@ -1331,7 +1359,18 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, // TODO: Perform finalization actions for variables. This has to be // called for variables which have destructors/finalizers. 
- auto finiCB = [&](InsertPointTy codeGenIP) {}; + auto finiCB = [&](InsertPointTy codeGenIP) { + InsertPointTy oldIP = builder.saveIP(); + builder.restoreIP(codeGenIP); + + // if the reduction has a cleanup region, inline it here to finalize the + // reduction variables + if (failed(inlineReductionCleanup(reductionDecls, privateReductionVariables, + moduleTranslation, builder))) + bodyGenStatus = failure(); + + builder.restoreIP(oldIP); + }; llvm::Value *ifCond = nullptr; if (auto ifExprVar = opInst.getIfExprVar()) diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index a00383cf4405..1134db77d5ba 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -436,6 +436,25 @@ atomic { // ----- +// expected-error @below {{op expects cleanup region with one argument of the reduction type}} +omp.declare_reduction @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = arith.constant 0.0 : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = arith.addf %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +cleanup { +^bb0(%arg: f64): + omp.yield +} + +// ----- + func.func @foo(%lb : index, %ub : index, %step : index) { %c1 = arith.constant 1 : i32 %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 30ce77423005..e2c255c7a3cc 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -603,6 +603,8 @@ func.func @omp_target_pretty(%if_cond : i1, %device : si32, %num_threads : i32) // CHECK: atomic // CHECK: ^{{.+}}(%{{.+}}: !llvm.ptr, %{{.+}}: !llvm.ptr): // CHECK: omp.yield +// CHECK: cleanup +// CHECK: omp.yield omp.declare_reduction @add_f32 : f32 init { ^bb0(%arg: f32): @@ -620,6 +622,10 @@ atomic { llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32 omp.yield } +cleanup { +^bb0(%arg: f32): + omp.yield +} // CHECK-LABEL: func @wsloop_reduction func.func 
@wsloop_reduction(%lb : index, %ub : index, %step : index) { @@ -789,6 +795,7 @@ combiner { omp.yield (%1 : f32) } // CHECK-NOT: atomic +// CHECK-NOT: cleanup // CHECK-LABEL: func @wsloop_reduction2 func.func @wsloop_reduction2(%lb : index, %ub : index, %step : index) { @@ -2088,6 +2095,7 @@ func.func @opaque_pointers_atomic_rwu(%v: !llvm.ptr, %x: !llvm.ptr) { // CHECK-LABEL: @opaque_pointers_reduction // CHECK: atomic { // CHECK-NEXT: ^{{[[:alnum:]]+}}(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr): +// CHECK-NOT: cleanup omp.declare_reduction @opaque_pointers_reduction : f32 init { ^bb0(%arg: f32): diff --git a/mlir/test/Target/LLVMIR/openmp-parallel-reduction-cleanup.mlir b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-cleanup.mlir new file mode 100644 index 000000000000..9ae4c4ad392b --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-parallel-reduction-cleanup.mlir @@ -0,0 +1,94 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +// test a parallel reduction with a cleanup region + + omp.declare_reduction @add_reduction_i_32 : !llvm.ptr init { + ^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(0 : i32) : i32 + %c4 = llvm.mlir.constant(4 : i64) : i64 + %2 = llvm.call @malloc(%c4) : (i64) -> !llvm.ptr + llvm.store %0, %2 : i32, !llvm.ptr + omp.yield(%2 : !llvm.ptr) + } combiner { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.load %arg0 : !llvm.ptr -> i32 + %1 = llvm.load %arg1 : !llvm.ptr -> i32 + %2 = llvm.add %0, %1 : i32 + llvm.store %2, %arg0 : i32, !llvm.ptr + omp.yield(%arg0 : !llvm.ptr) + } cleanup { + ^bb0(%arg0: !llvm.ptr): + llvm.call @free(%arg0) : (!llvm.ptr) -> () + omp.yield + } + + // CHECK-LABEL: @main + llvm.func @main() { + %0 = llvm.mlir.constant(-1 : i32) : i32 + %1 = llvm.mlir.addressof @i : !llvm.ptr + %2 = llvm.mlir.addressof @j : !llvm.ptr + omp.parallel byref reduction(@add_reduction_i_32 %1 -> %arg0 : !llvm.ptr, @add_reduction_i_32 %2 -> %arg1 : !llvm.ptr) { + llvm.store %0, %arg0 : i32, !llvm.ptr + 
llvm.store %0, %arg1 : i32, !llvm.ptr + omp.terminator + } + llvm.return + } + llvm.mlir.global internal @i() {addr_space = 0 : i32} : i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + llvm.return %0 : i32 + } + llvm.mlir.global internal @j() {addr_space = 0 : i32} : i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + llvm.return %0 : i32 + } + llvm.func @malloc(%arg0 : i64) -> !llvm.ptr + llvm.func @free(%arg0 : !llvm.ptr) -> () + +// CHECK: %{{.+}} = +// Call to the outlined function. +// CHECK: call void {{.*}} @__kmpc_fork_call +// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Outlined function. +// CHECK: define internal void @[[OUTLINED]] + +// Private reduction variable and its initialization. +// CHECK: %tid.addr.local = alloca i32 +// CHECK: %[[MALLOC_I:.+]] = call ptr @malloc(i64 4) +// CHECK: %[[PRIV_PTR_I:.+]] = alloca ptr +// CHECK: store ptr %[[MALLOC_I]], ptr %[[PRIV_PTR_I]] +// CHECK: %[[MALLOC_J:.+]] = call ptr @malloc(i64 4) +// CHECK: %[[PRIV_PTR_J:.+]] = alloca ptr +// CHECK: store ptr %[[MALLOC_J]], ptr %[[PRIV_PTR_J]] + +// Call to the reduction function. 
+// CHECK: call i32 @__kmpc_reduce +// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]] + + +// Non-atomic reduction: +// CHECK: %[[PRIV_VAL_PTR_I:.+]] = load ptr, ptr %[[PRIV_PTR_I]] +// CHECK: %[[LOAD_I:.+]] = load i32, ptr @i +// CHECK: %[[PRIV_VAL_I:.+]] = load i32, ptr %[[PRIV_VAL_PTR_I]] +// CHECK: %[[SUM_I:.+]] = add i32 %[[LOAD_I]], %[[PRIV_VAL_I]] +// CHECK: store i32 %[[SUM_I]], ptr @i +// CHECK: %[[PRIV_VAL_PTR_J:.+]] = load ptr, ptr %[[PRIV_PTR_J]] +// CHECK: %[[LOAD_J:.+]] = load i32, ptr @j +// CHECK: %[[PRIV_VAL_J:.+]] = load i32, ptr %[[PRIV_VAL_PTR_J]] +// CHECK: %[[SUM_J:.+]] = add i32 %[[LOAD_J]], %[[PRIV_VAL_J]] +// CHECK: store i32 %[[SUM_J]], ptr @j +// CHECK: call void @__kmpc_end_reduce +// CHECK: br label %[[FINALIZE:.+]] + +// CHECK: [[FINALIZE]]: +// CHECK: br label %[[OMP_FINALIZE:.+]] + +// Cleanup region: +// CHECK: [[OMP_FINALIZE]]: +// CHECK: call void @free(ptr %[[PRIV_PTR_I]]) +// CHECK: call void @free(ptr %[[PRIV_PTR_J]]) + +// Reduction function. +// CHECK: define internal void @[[REDFUNC]] +// CHECK: add i32 diff --git a/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir b/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir new file mode 100644 index 000000000000..a1e17afa53e2 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-wsloop-reduction-cleanup.mlir @@ -0,0 +1,86 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +// test a wsloop reduction with a cleanup region + + omp.declare_reduction @add_reduction_i_32 : !llvm.ptr init { + ^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(0 : i32) : i32 + %c4 = llvm.mlir.constant(4 : i64) : i64 + %2 = llvm.call @malloc(%c4) : (i64) -> !llvm.ptr + llvm.store %0, %2 : i32, !llvm.ptr + omp.yield(%2 : !llvm.ptr) + } combiner { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.load %arg0 : !llvm.ptr -> i32 + %1 = llvm.load %arg1 : !llvm.ptr -> i32 + %2 = llvm.add %0, %1 : i32 + llvm.store %2, %arg0 : i32, !llvm.ptr + omp.yield(%arg0 
: !llvm.ptr) + } cleanup { + ^bb0(%arg0: !llvm.ptr): + llvm.call @free(%arg0) : (!llvm.ptr) -> () + omp.yield + } + + // CHECK-LABEL: @main + llvm.func @main() { + %0 = llvm.mlir.constant(-1 : i32) : i32 + %1 = llvm.mlir.addressof @i : !llvm.ptr + %2 = llvm.mlir.addressof @j : !llvm.ptr + %loop_ub = llvm.mlir.constant(9 : i32) : i32 + %loop_lb = llvm.mlir.constant(0 : i32) : i32 + %loop_step = llvm.mlir.constant(1 : i32) : i32 + omp.wsloop byref reduction(@add_reduction_i_32 %1 -> %arg0 : !llvm.ptr, @add_reduction_i_32 %2 -> %arg1 : !llvm.ptr) for (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) { + llvm.store %0, %arg0 : i32, !llvm.ptr + llvm.store %0, %arg1 : i32, !llvm.ptr + omp.terminator + } + llvm.return + } + llvm.mlir.global internal @i() {addr_space = 0 : i32} : i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + llvm.return %0 : i32 + } + llvm.mlir.global internal @j() {addr_space = 0 : i32} : i32 { + %0 = llvm.mlir.constant(0 : i32) : i32 + llvm.return %0 : i32 + } + llvm.func @malloc(%arg0 : i64) -> !llvm.ptr + llvm.func @free(%arg0 : !llvm.ptr) -> () + +// Private reduction variable and its initialization. +// CHECK: %[[MALLOC_I:.+]] = call ptr @malloc(i64 4) +// CHECK: %[[PRIV_PTR_I:.+]] = alloca ptr +// CHECK: store ptr %[[MALLOC_I]], ptr %[[PRIV_PTR_I]] +// CHECK: %[[MALLOC_J:.+]] = call ptr @malloc(i64 4) +// CHECK: %[[PRIV_PTR_J:.+]] = alloca ptr +// CHECK: store ptr %[[MALLOC_J]], ptr %[[PRIV_PTR_J]] + +// Call to the reduction function. 
+// CHECK: call i32 @__kmpc_reduce +// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]] + +// Weirdly the finalization block is generated before the reduction blocks: +// CHECK: [[FINALIZE:.+]]: +// CHECK: call void @__kmpc_barrier +// CHECK: call void @free(ptr %[[PRIV_PTR_I]]) +// CHECK: call void @free(ptr %[[PRIV_PTR_J]]) +// CHECK: ret void + +// Non-atomic reduction: +// CHECK: %[[PRIV_VAL_PTR_I:.+]] = load ptr, ptr %[[PRIV_PTR_I]] +// CHECK: %[[LOAD_I:.+]] = load i32, ptr @i +// CHECK: %[[PRIV_VAL_I:.+]] = load i32, ptr %[[PRIV_VAL_PTR_I]] +// CHECK: %[[SUM_I:.+]] = add i32 %[[LOAD_I]], %[[PRIV_VAL_I]] +// CHECK: store i32 %[[SUM_I]], ptr @i +// CHECK: %[[PRIV_VAL_PTR_J:.+]] = load ptr, ptr %[[PRIV_PTR_J]] +// CHECK: %[[LOAD_J:.+]] = load i32, ptr @j +// CHECK: %[[PRIV_VAL_J:.+]] = load i32, ptr %[[PRIV_VAL_PTR_J]] +// CHECK: %[[SUM_J:.+]] = add i32 %[[LOAD_J]], %[[PRIV_VAL_J]] +// CHECK: store i32 %[[SUM_J]], ptr @j +// CHECK: call void @__kmpc_end_reduce +// CHECK: br label %[[FINALIZE]] + +// Reduction function. 
+// CHECK: define internal void @[[REDFUNC]] +// CHECK: add i32 -- cgit v1.2.3 From 212b2bbcd1583353bccef3418e94912a30775715 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Apr 2024 11:22:37 +0100 Subject: [VectorCombine][X86] foldShuffleOfCastops - fold shuffle(cast(x),cast(y)) -> cast(shuffle(x,y)) iff cost efficient (#87510) Based off the existing foldShuffleOfBinops fold Fixes #67803 --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 71 +++++++++++++++++++ llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll | 6 +- .../VectorCombine/X86/shuffle-of-casts.ll | 79 +++++++++++----------- 3 files changed, 110 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index af5e7c9bc385..3738220b4f81 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -112,6 +112,7 @@ private: bool foldSingleElementStore(Instruction &I); bool scalarizeLoadExtract(Instruction &I); bool foldShuffleOfBinops(Instruction &I); + bool foldShuffleOfCastops(Instruction &I); bool foldShuffleFromReductions(Instruction &I); bool foldTruncFromReductions(Instruction &I); bool foldSelectShuffle(Instruction &I, bool FromReduction = false); @@ -1432,6 +1433,75 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { return true; } +/// Try to convert "shuffle (castop), (castop)" with a shared castop operand +/// into "castop (shuffle)". +bool VectorCombine::foldShuffleOfCastops(Instruction &I) { + Value *V0, *V1; + ArrayRef Mask; + if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)), + m_Mask(Mask)))) + return false; + + auto *C0 = dyn_cast(V0); + auto *C1 = dyn_cast(V1); + if (!C0 || !C1) + return false; + + Instruction::CastOps Opcode = C0->getOpcode(); + if (Opcode == Instruction::BitCast || C0->getSrcTy() != C1->getSrcTy()) + return false; + + // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds. 
+ if (Opcode != C1->getOpcode()) { + if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value()))) + Opcode = Instruction::SExt; + else + return false; + } + + auto *ShuffleDstTy = dyn_cast(I.getType()); + auto *CastDstTy = dyn_cast(C0->getDestTy()); + auto *CastSrcTy = dyn_cast(C0->getSrcTy()); + if (!ShuffleDstTy || !CastDstTy || !CastSrcTy) + return false; + assert(CastDstTy->getElementCount() == CastSrcTy->getElementCount() && + "Unexpected src/dst element counts"); + + auto *NewShuffleDstTy = + FixedVectorType::get(CastSrcTy->getScalarType(), Mask.size()); + + // Try to replace a castop with a shuffle if the shuffle is not costly. + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + InstructionCost OldCost = + TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy, + TTI::CastContextHint::None, CostKind) + + TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy, + TTI::CastContextHint::None, CostKind); + OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, + CastDstTy, Mask, CostKind); + + InstructionCost NewCost = TTI.getShuffleCost( + TargetTransformInfo::SK_PermuteTwoSrc, CastSrcTy, Mask, CostKind); + NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy, + TTI::CastContextHint::None, CostKind); + if (NewCost > OldCost) + return false; + + Value *Shuf = + Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0), Mask); + Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy); + + // Intersect flags from the old casts. + if (auto *NewInst = dyn_cast(Cast)) { + NewInst->copyIRFlags(C0); + NewInst->andIRFlags(C1); + } + + replaceValue(I, *Cast); + return true; +} + /// Given a commutative reduction, the order of the input lanes does not alter /// the results. We can use this to remove certain shuffles feeding the /// reduction, removing the need to shuffle at all. 
@@ -1986,6 +2056,7 @@ bool VectorCombine::run() { break; case Instruction::ShuffleVector: MadeChange |= foldShuffleOfBinops(I); + MadeChange |= foldShuffleOfCastops(I); MadeChange |= foldSelectShuffle(I); break; case Instruction::BitCast: diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll index 495ec0a63399..45e411d73316 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll @@ -9,11 +9,7 @@ define <4 x i64> @PR67803(<4 x i64> %x, <4 x i64> %y, <4 x i64> %a, <4 x i64> %b ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i64> [[X:%.*]] to <8 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[Y:%.*]] to <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[CMP_I21:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[SEXT_I22:%.*]] = sext <4 x i1> [[CMP_I21]] to <4 x i32> -; CHECK-NEXT: [[CMP_I:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[SEXT_I22]], <4 x i32> [[SEXT_I]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <32 x i8> [[TMP5]], <32 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8> diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll index f804300b1022..2031c2d04c60 100644 --- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll @@ -6,9 +6,8 @@ define <16 x i32> @concat_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: @concat_zext_v8i16_v16i32( -; CHECK-NEXT: [[X0:%.*]] = 
zext <8 x i16> [[A0:%.*]] to <8 x i32> -; CHECK-NEXT: [[X1:%.*]] = zext <8 x i16> [[A1:%.*]] to <8 x i32> -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> +; CHECK-NEXT: [[R:%.*]] = zext <16 x i16> [[TMP1]] to <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[R]] ; %x0 = zext <8 x i16> %a0 to <8 x i32> @@ -19,9 +18,8 @@ define <16 x i32> @concat_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { define <16 x i32> @concat_zext_nneg_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: @concat_zext_nneg_v8i16_v16i32( -; CHECK-NEXT: [[X0:%.*]] = zext nneg <8 x i16> [[A0:%.*]] to <8 x i32> -; CHECK-NEXT: [[X1:%.*]] = zext nneg <8 x i16> [[A1:%.*]] to <8 x i32> -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> +; CHECK-NEXT: [[R:%.*]] = zext nneg <16 x i16> [[TMP1]] to <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[R]] ; %x0 = zext nneg <8 x i16> %a0 to <8 x i32> @@ -30,13 +28,17 @@ define <16 x i32> @concat_zext_nneg_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { ret <16 x i32> %r } -; TODO - sext + zext nneg -> sext define <16 x i32> @concat_sext_zext_nneg_v8i16_v8i32(<8 x i16> %a0, <8 x i16> %a1) { -; CHECK-LABEL: @concat_sext_zext_nneg_v8i16_v8i32( -; CHECK-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32> -; CHECK-NEXT: [[X1:%.*]] = zext nneg <8 x i16> [[A1:%.*]] to <8 x i32> -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> -; CHECK-NEXT: ret <16 x i32> [[R]] +; SSE-LABEL: @concat_sext_zext_nneg_v8i16_v8i32( +; SSE-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32> +; SSE-NEXT: [[X1:%.*]] = zext nneg <8 x i16> [[A1:%.*]] to <8 x i32> +; SSE-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> +; SSE-NEXT: ret <16 x i32> [[R]] 
+; +; AVX-LABEL: @concat_sext_zext_nneg_v8i16_v8i32( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> +; AVX-NEXT: [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32> +; AVX-NEXT: ret <16 x i32> [[R]] ; %x0 = sext <8 x i16> %a0 to <8 x i32> %x1 = zext nneg <8 x i16> %a1 to <8 x i32> @@ -46,9 +48,8 @@ define <16 x i32> @concat_sext_zext_nneg_v8i16_v8i32(<8 x i16> %a0, <8 x i16> %a define <16 x i32> @concat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: @concat_sext_v8i16_v16i32( -; CHECK-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32> -; CHECK-NEXT: [[X1:%.*]] = sext <8 x i16> [[A1:%.*]] to <8 x i32> -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> +; CHECK-NEXT: [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[R]] ; %x0 = sext <8 x i16> %a0 to <8 x i32> @@ -59,9 +60,8 @@ define <16 x i32> @concat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { define <8 x i32> @concat_sext_v4i1_v8i32(<4 x i1> %a0, <4 x i1> %a1) { ; CHECK-LABEL: @concat_sext_v4i1_v8i32( -; CHECK-NEXT: [[X0:%.*]] = sext <4 x i1> [[A0:%.*]] to <4 x i32> -; CHECK-NEXT: [[X1:%.*]] = sext <4 x i1> [[A1:%.*]] to <4 x i32> -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X1]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[A0:%.*]], <4 x i1> [[A1:%.*]], <8 x i32> +; CHECK-NEXT: [[R:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[R]] ; %x0 = sext <4 x i1> %a0 to <4 x i32> @@ -72,9 +72,8 @@ define <8 x i32> @concat_sext_v4i1_v8i32(<4 x i1> %a0, <4 x i1> %a1) { define <8 x i16> @concat_trunc_v4i32_v8i16(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: @concat_trunc_v4i32_v8i16( -; CHECK-NEXT: [[X0:%.*]] = trunc <4 x i32> [[A0:%.*]] to <4 x i16> -; CHECK-NEXT: [[X1:%.*]] = trunc <4 x i32> [[A1:%.*]] to 
<4 x i16> -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[X0]], <4 x i16> [[X1]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> +; CHECK-NEXT: [[R:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16> ; CHECK-NEXT: ret <8 x i16> [[R]] ; %x0 = trunc <4 x i32> %a0 to <4 x i16> @@ -85,9 +84,8 @@ define <8 x i16> @concat_trunc_v4i32_v8i16(<4 x i32> %a0, <4 x i32> %a1) { define <8 x ptr> @concat_inttoptr_v4i32_v8iptr(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: @concat_inttoptr_v4i32_v8iptr( -; CHECK-NEXT: [[X0:%.*]] = inttoptr <4 x i32> [[A0:%.*]] to <4 x ptr> -; CHECK-NEXT: [[X1:%.*]] = inttoptr <4 x i32> [[A1:%.*]] to <4 x ptr> -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x ptr> [[X0]], <4 x ptr> [[X1]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> +; CHECK-NEXT: [[R:%.*]] = inttoptr <8 x i32> [[TMP1]] to <8 x ptr> ; CHECK-NEXT: ret <8 x ptr> [[R]] ; %x0 = inttoptr <4 x i32> %a0 to <4 x ptr> @@ -98,9 +96,8 @@ define <8 x ptr> @concat_inttoptr_v4i32_v8iptr(<4 x i32> %a0, <4 x i32> %a1) { define <16 x i64> @concat_ptrtoint_v8i16_v16i32(<8 x ptr> %a0, <8 x ptr> %a1) { ; CHECK-LABEL: @concat_ptrtoint_v8i16_v16i32( -; CHECK-NEXT: [[X0:%.*]] = ptrtoint <8 x ptr> [[A0:%.*]] to <8 x i64> -; CHECK-NEXT: [[X1:%.*]] = ptrtoint <8 x ptr> [[A1:%.*]] to <8 x i64> -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i64> [[X0]], <8 x i64> [[X1]], <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[A0:%.*]], <8 x ptr> [[A1:%.*]], <16 x i32> +; CHECK-NEXT: [[R:%.*]] = ptrtoint <16 x ptr> [[TMP1]] to <16 x i64> ; CHECK-NEXT: ret <16 x i64> [[R]] ; %x0 = ptrtoint <8 x ptr> %a0 to <8 x i64> @@ -110,11 +107,16 @@ define <16 x i64> @concat_ptrtoint_v8i16_v16i32(<8 x ptr> %a0, <8 x ptr> %a1) { } define <8 x double> @concat_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: @concat_fpext_v4f32_v8f64( -; CHECK-NEXT: [[X0:%.*]] = fpext <4 x 
float> [[A0:%.*]] to <4 x double> -; CHECK-NEXT: [[X1:%.*]] = fpext <4 x float> [[A1:%.*]] to <4 x double> -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x double> [[X0]], <4 x double> [[X1]], <8 x i32> -; CHECK-NEXT: ret <8 x double> [[R]] +; SSE-LABEL: @concat_fpext_v4f32_v8f64( +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <8 x i32> +; SSE-NEXT: [[R:%.*]] = fpext <8 x float> [[TMP1]] to <8 x double> +; SSE-NEXT: ret <8 x double> [[R]] +; +; AVX-LABEL: @concat_fpext_v4f32_v8f64( +; AVX-NEXT: [[X0:%.*]] = fpext <4 x float> [[A0:%.*]] to <4 x double> +; AVX-NEXT: [[X1:%.*]] = fpext <4 x float> [[A1:%.*]] to <4 x double> +; AVX-NEXT: [[R:%.*]] = shufflevector <4 x double> [[X0]], <4 x double> [[X1]], <8 x i32> +; AVX-NEXT: ret <8 x double> [[R]] ; %x0 = fpext <4 x float> %a0 to <4 x double> %x1 = fpext <4 x float> %a1 to <4 x double> @@ -139,9 +141,8 @@ define <16 x float> @concat_fptrunc_v8f64_v16f32(<8 x double> %a0, <8 x double> define <16 x i32> @rconcat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: @rconcat_sext_v8i16_v16i32( -; CHECK-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32> -; CHECK-NEXT: [[X1:%.*]] = sext <8 x i16> [[A1:%.*]] to <8 x i32> -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> +; CHECK-NEXT: [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[R]] ; %x0 = sext <8 x i16> %a0 to <8 x i32> @@ -154,9 +155,8 @@ define <16 x i32> @rconcat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { define <8 x double> @interleave_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: @interleave_fpext_v4f32_v8f64( -; CHECK-NEXT: [[X0:%.*]] = fpext <4 x float> [[A0:%.*]] to <4 x double> -; CHECK-NEXT: [[X1:%.*]] = fpext <4 x float> [[A1:%.*]] to <4 x double> -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x double> 
[[X0]], <4 x double> [[X1]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <8 x i32> +; CHECK-NEXT: [[R:%.*]] = fpext <8 x float> [[TMP1]] to <8 x double> ; CHECK-NEXT: ret <8 x double> [[R]] ; %x0 = fpext <4 x float> %a0 to <4 x double> @@ -226,6 +226,3 @@ define <16 x i32> @concat_sext_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> ret <16 x i32> %r } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; AVX: {{.*}} -; SSE: {{.*}} -- cgit v1.2.3 From bf2d7858e5998bed28d437973588e0294bd54bcf Mon Sep 17 00:00:00 2001 From: Zahira Ammarguellat Date: Thu, 4 Apr 2024 03:25:41 -0700 Subject: Fix bug in the type promotion for complex division in strict FP mode. (#87500) Complex division on Windows with `-fcomplex-arithmetic=promoted` and `-ffp-model=strict` is crashing. This patch fixes the issue. See https://godbolt.org/z/15Gh7nvdM --- clang/lib/CodeGen/CGExprComplex.cpp | 6 +- clang/test/CodeGen/cx-complex-range.c | 656 ++++++++++++++++++++++++++++++++++ 2 files changed, 659 insertions(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp index a793b214645c..1facadd82f17 100644 --- a/clang/lib/CodeGen/CGExprComplex.cpp +++ b/clang/lib/CodeGen/CGExprComplex.cpp @@ -319,12 +319,12 @@ public: // doubles the exponent of SmallerType.LargestFiniteVal) if (llvm::APFloat::semanticsMaxExponent(ElementTypeSemantics) * 2 + 1 <= llvm::APFloat::semanticsMaxExponent(HigherElementTypeSemantics)) { + FPHasBeenPromoted = true; return CGF.getContext().getComplexType(HigherElementType); } else { - FPHasBeenPromoted = true; DiagnosticsEngine &Diags = CGF.CGM.getDiags(); Diags.Report(diag::warn_next_larger_fp_type_same_size_than_fp); - return CGF.getContext().getComplexType(ElementType); + return QualType(); } } @@ -1037,7 +1037,7 @@ ComplexPairTy 
ComplexExprEmitter::EmitBinDiv(const BinOpInfo &Op) { LHSi = llvm::Constant::getNullValue(RHSi->getType()); if (Op.FPFeatures.getComplexRange() == LangOptions::CX_Improved || (Op.FPFeatures.getComplexRange() == LangOptions::CX_Promoted && - FPHasBeenPromoted)) + !FPHasBeenPromoted)) return EmitRangeReductionDiv(LHSr, LHSi, RHSr, RHSi); else if (Op.FPFeatures.getComplexRange() == LangOptions::CX_Basic || Op.FPFeatures.getComplexRange() == LangOptions::CX_Promoted) diff --git a/clang/test/CodeGen/cx-complex-range.c b/clang/test/CodeGen/cx-complex-range.c index 9ec80252085b..38f99230eea5 100644 --- a/clang/test/CodeGen/cx-complex-range.c +++ b/clang/test/CodeGen/cx-complex-range.c @@ -48,6 +48,15 @@ // RUN: -ffast-math -complex-range=promoted -emit-llvm -o - %s \ // RUN: | FileCheck %s --check-prefix=PRMTD_FAST +// strict math mode +// RUN: %clang_cc1 -triple x86_64-windows-pc -complex-range=promoted \ +// RUN: -ffp-contract=off -frounding-math -ffp-exception-behavior=strict \ +// RUN: -emit-llvm -o - %s | FileCheck %s --check-prefix=X86WINPRMTD_STRICT + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -complex-range=promoted \ +// RUN: -ffp-contract=off -frounding-math -ffp-exception-behavior=strict \ +// RUN: -emit-llvm -o - %s | FileCheck %s --check-prefix=PRMTD_STRICT + // FULL-LABEL: define dso_local <2 x float> @divf( // FULL-SAME: <2 x float> noundef [[A_COERCE:%.*]], <2 x float> noundef [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { // FULL-NEXT: entry: @@ -504,6 +513,86 @@ // PRMTD_FAST-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr [[RETVAL]], align 4 // PRMTD_FAST-NEXT: ret <2 x float> [[TMP11]] // +// X86WINPRMTD_STRICT-LABEL: define dso_local i64 @divf( +// X86WINPRMTD_STRICT-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// X86WINPRMTD_STRICT-NEXT: entry: +// X86WINPRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { float, float }, align 4 +// X86WINPRMTD_STRICT-NEXT: [[A:%.*]] = alloca { float, float }, align 4 +// 
X86WINPRMTD_STRICT-NEXT: [[B:%.*]] = alloca { float, float }, align 4 +// X86WINPRMTD_STRICT-NEXT: store i64 [[A_COERCE]], ptr [[A]], align 4 +// X86WINPRMTD_STRICT-NEXT: store i64 [[B_COERCE]], ptr [[B]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[EXT:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[A_REAL]], metadata !"fpexcept.strict") #[[ATTR3:[0-9]+]] +// X86WINPRMTD_STRICT-NEXT: [[EXT1:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[A_IMAG]], metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load float, ptr [[B_REALP]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load float, ptr [[B_IMAGP]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[EXT2:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[B_REAL]], metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[EXT3:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[B_IMAG]], metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP0:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT]], double [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP1:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT1]], double [[EXT3]], metadata !"round.dynamic", metadata 
!"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP0]], double [[TMP1]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP3:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT2]], double [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP4:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT3]], double [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP5:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP3]], double [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT1]], double [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP7:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT]], double [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP8:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP6]], double [[TMP7]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP9:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP2]], double [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP10:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP8]], double [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[UNPROMOTION:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP9]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] 
+// X86WINPRMTD_STRICT-NEXT: [[UNPROMOTION4:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: store float [[UNPROMOTION]], ptr [[RETVAL_REALP]], align 4 +// X86WINPRMTD_STRICT-NEXT: store float [[UNPROMOTION4]], ptr [[RETVAL_IMAGP]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[TMP11:%.*]] = load i64, ptr [[RETVAL]], align 4 +// X86WINPRMTD_STRICT-NEXT: ret i64 [[TMP11]] +// +// PRMTD_STRICT-LABEL: define dso_local <2 x float> @divf( +// PRMTD_STRICT-SAME: <2 x float> noundef [[A_COERCE:%.*]], <2 x float> noundef [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// PRMTD_STRICT-NEXT: entry: +// PRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { float, float }, align 4 +// PRMTD_STRICT-NEXT: [[A:%.*]] = alloca { float, float }, align 4 +// PRMTD_STRICT-NEXT: [[B:%.*]] = alloca { float, float }, align 4 +// PRMTD_STRICT-NEXT: store <2 x float> [[A_COERCE]], ptr [[A]], align 4 +// PRMTD_STRICT-NEXT: store <2 x float> [[B_COERCE]], ptr [[B]], align 4 +// PRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 4 +// PRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 4 +// PRMTD_STRICT-NEXT: [[EXT:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[A_REAL]], metadata !"fpexcept.strict") #[[ATTR4:[0-9]+]] +// PRMTD_STRICT-NEXT: [[EXT1:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[A_IMAG]], metadata !"fpexcept.strict") #[[ATTR4]] +// 
PRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load float, ptr [[B_REALP]], align 4 +// PRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load float, ptr [[B_IMAGP]], align 4 +// PRMTD_STRICT-NEXT: [[EXT2:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[B_REAL]], metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[EXT3:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[B_IMAG]], metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP0:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT]], double [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP1:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT1]], double [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP0]], double [[TMP1]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP3:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT2]], double [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP4:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT3]], double [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP5:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP3]], double [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT1]], double [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") 
#[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP7:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT]], double [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP8:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP6]], double [[TMP7]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP9:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP2]], double [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP10:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP8]], double [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[UNPROMOTION:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP9]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[UNPROMOTION4:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: store float [[UNPROMOTION]], ptr [[RETVAL_REALP]], align 4 +// PRMTD_STRICT-NEXT: store float [[UNPROMOTION4]], ptr [[RETVAL_IMAGP]], align 4 +// PRMTD_STRICT-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr [[RETVAL]], align 4 +// PRMTD_STRICT-NEXT: ret <2 x float> [[TMP11]] +// _Complex float divf(_Complex float a, _Complex float b) { return a / b; } @@ -873,6 +962,64 @@ _Complex float divf(_Complex float a, _Complex float b) { // PRMTD_FAST-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[RETVAL]], align 4 // PRMTD_FAST-NEXT: ret <2 x float> [[TMP0]] // +// X86WINPRMTD_STRICT-LABEL: define 
dso_local i64 @mulf( +// X86WINPRMTD_STRICT-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-NEXT: entry: +// X86WINPRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { float, float }, align 4 +// X86WINPRMTD_STRICT-NEXT: [[A:%.*]] = alloca { float, float }, align 4 +// X86WINPRMTD_STRICT-NEXT: [[B:%.*]] = alloca { float, float }, align 4 +// X86WINPRMTD_STRICT-NEXT: store i64 [[A_COERCE]], ptr [[A]], align 4 +// X86WINPRMTD_STRICT-NEXT: store i64 [[B_COERCE]], ptr [[B]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load float, ptr [[B_REALP]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load float, ptr [[B_IMAGP]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[MUL_AC:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[A_REAL]], float [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[MUL_BD:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[A_IMAG]], float [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[MUL_AD:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[A_REAL]], float [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[MUL_BC:%.*]] = call float 
@llvm.experimental.constrained.fmul.f32(float [[A_IMAG]], float [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[MUL_R:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[MUL_AC]], float [[MUL_BD]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[MUL_I:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[MUL_AD]], float [[MUL_BC]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: store float [[MUL_R]], ptr [[RETVAL_REALP]], align 4 +// X86WINPRMTD_STRICT-NEXT: store float [[MUL_I]], ptr [[RETVAL_IMAGP]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[TMP0:%.*]] = load i64, ptr [[RETVAL]], align 4 +// X86WINPRMTD_STRICT-NEXT: ret i64 [[TMP0]] +// +// PRMTD_STRICT-LABEL: define dso_local <2 x float> @mulf( +// PRMTD_STRICT-SAME: <2 x float> noundef [[A_COERCE:%.*]], <2 x float> noundef [[B_COERCE:%.*]]) #[[ATTR0]] { +// PRMTD_STRICT-NEXT: entry: +// PRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { float, float }, align 4 +// PRMTD_STRICT-NEXT: [[A:%.*]] = alloca { float, float }, align 4 +// PRMTD_STRICT-NEXT: [[B:%.*]] = alloca { float, float }, align 4 +// PRMTD_STRICT-NEXT: store <2 x float> [[A_COERCE]], ptr [[A]], align 4 +// PRMTD_STRICT-NEXT: store <2 x float> [[B_COERCE]], ptr [[B]], align 4 +// PRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 4 +// PRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 
4 +// PRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load float, ptr [[B_REALP]], align 4 +// PRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load float, ptr [[B_IMAGP]], align 4 +// PRMTD_STRICT-NEXT: [[MUL_AC:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[A_REAL]], float [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[MUL_BD:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[A_IMAG]], float [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[MUL_AD:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[A_REAL]], float [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[MUL_BC:%.*]] = call float @llvm.experimental.constrained.fmul.f32(float [[A_IMAG]], float [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[MUL_R:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[MUL_AC]], float [[MUL_BD]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[MUL_I:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[MUL_AD]], float [[MUL_BC]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: store float [[MUL_R]], ptr [[RETVAL_REALP]], align 4 +// PRMTD_STRICT-NEXT: store float [[MUL_I]], ptr [[RETVAL_IMAGP]], align 4 +// PRMTD_STRICT-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[RETVAL]], align 4 +// 
PRMTD_STRICT-NEXT: ret <2 x float> [[TMP0]] +// _Complex float mulf(_Complex float a, _Complex float b) { return a * b; } @@ -1411,6 +1558,112 @@ _Complex float mulf(_Complex float a, _Complex float b) { // PRMTD_FAST-NEXT: [[TMP15:%.*]] = load { double, double }, ptr [[RETVAL]], align 8 // PRMTD_FAST-NEXT: ret { double, double } [[TMP15]] // +// X86WINPRMTD_STRICT-LABEL: define dso_local void @divd( +// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-NEXT: entry: +// X86WINPRMTD_STRICT-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 8 +// X86WINPRMTD_STRICT-NEXT: [[B_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 +// X86WINPRMTD_STRICT-NEXT: [[A_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 +// X86WINPRMTD_STRICT-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 8 +// X86WINPRMTD_STRICT-NEXT: store ptr [[B]], ptr [[B_INDIRECT_ADDR]], align 8 +// X86WINPRMTD_STRICT-NEXT: store ptr [[A]], ptr [[A_INDIRECT_ADDR]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load double, ptr [[A_IMAGP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[TMP0:%.*]] = call double @llvm.fabs.f64(double [[B_REAL]]) #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: 
[[TMP1:%.*]] = call double @llvm.fabs.f64(double [[B_IMAG]]) #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[ABS_CMP:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[TMP0]], double [[TMP1]], metadata !"ugt", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: br i1 [[ABS_CMP]], label [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI:%.*]], label [[ABS_RHSR_LESS_THAN_ABS_RHSI:%.*]] +// X86WINPRMTD_STRICT: abs_rhsr_greater_or_equal_abs_rhsi: +// X86WINPRMTD_STRICT-NEXT: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[B_IMAG]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP3:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[TMP2]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP4:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[B_REAL]], double [[TMP3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP5:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[A_REAL]], double [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP7:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP6]], double [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP8:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP9:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[A_IMAG]], double [[TMP8]], metadata 
!"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP10:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP9]], double [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: br label [[COMPLEX_DIV:%.*]] +// X86WINPRMTD_STRICT: abs_rhsr_less_than_abs_rhsi: +// X86WINPRMTD_STRICT-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[B_REAL]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[TMP11]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[B_IMAG]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP14:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP14]], double [[A_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP16:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP15]], double [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP18:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP17]], double [[A_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// 
X86WINPRMTD_STRICT-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP18]], double [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: br label [[COMPLEX_DIV]] +// X86WINPRMTD_STRICT: complex_div: +// X86WINPRMTD_STRICT-NEXT: [[TMP20:%.*]] = phi double [ [[TMP7]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP16]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ] +// X86WINPRMTD_STRICT-NEXT: [[TMP21:%.*]] = phi double [ [[TMP10]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP19]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ] +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: store double [[TMP20]], ptr [[AGG_RESULT_REALP]], align 8 +// X86WINPRMTD_STRICT-NEXT: store double [[TMP21]], ptr [[AGG_RESULT_IMAGP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP1:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REAL:%.*]] = load double, ptr [[AGG_RESULT_REALP1]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP2:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAG:%.*]] = load double, ptr [[AGG_RESULT_IMAGP2]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP3:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP4:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: store double [[AGG_RESULT_REAL]], ptr [[AGG_RESULT_REALP3]], align 8 +// X86WINPRMTD_STRICT-NEXT: store double [[AGG_RESULT_IMAG]], ptr [[AGG_RESULT_IMAGP4]], align 8 +// 
X86WINPRMTD_STRICT-NEXT: ret void +// +// PRMTD_STRICT-LABEL: define dso_local { double, double } @divd( +// PRMTD_STRICT-SAME: double noundef [[A_COERCE0:%.*]], double noundef [[A_COERCE1:%.*]], double noundef [[B_COERCE0:%.*]], double noundef [[B_COERCE1:%.*]]) #[[ATTR2:[0-9]+]] { +// PRMTD_STRICT-NEXT: entry: +// PRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { double, double }, align 8 +// PRMTD_STRICT-NEXT: [[A:%.*]] = alloca { double, double }, align 8 +// PRMTD_STRICT-NEXT: [[B:%.*]] = alloca { double, double }, align 8 +// PRMTD_STRICT-NEXT: [[TMP0:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: store double [[A_COERCE0]], ptr [[TMP0]], align 8 +// PRMTD_STRICT-NEXT: [[TMP1:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: store double [[A_COERCE1]], ptr [[TMP1]], align 8 +// PRMTD_STRICT-NEXT: [[TMP2:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: store double [[B_COERCE0]], ptr [[TMP2]], align 8 +// PRMTD_STRICT-NEXT: [[TMP3:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: store double [[B_COERCE1]], ptr [[TMP3]], align 8 +// PRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 +// PRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load double, ptr [[A_IMAGP]], align 8 +// PRMTD_STRICT-NEXT: [[EXT:%.*]] = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f64(double [[A_REAL]], metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[EXT1:%.*]] = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f64(double [[A_IMAG]], metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { double, 
double }, ptr [[B]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 +// PRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 8 +// PRMTD_STRICT-NEXT: [[EXT2:%.*]] = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f64(double [[B_REAL]], metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[EXT3:%.*]] = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f64(double [[B_IMAG]], metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP4:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[EXT]], x86_fp80 [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP5:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[EXT1]], x86_fp80 [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP6:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[TMP4]], x86_fp80 [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP7:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[EXT2]], x86_fp80 [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP8:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[EXT3]], x86_fp80 [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP9:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[TMP7]], x86_fp80 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP10:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[EXT1]], x86_fp80 [[EXT2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// 
PRMTD_STRICT-NEXT: [[TMP11:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[EXT]], x86_fp80 [[EXT3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP12:%.*]] = call x86_fp80 @llvm.experimental.constrained.fsub.f80(x86_fp80 [[TMP10]], x86_fp80 [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP13:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP6]], x86_fp80 [[TMP9]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP14:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP12]], x86_fp80 [[TMP9]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[UNPROMOTION:%.*]] = call double @llvm.experimental.constrained.fptrunc.f64.f80(x86_fp80 [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[UNPROMOTION4:%.*]] = call double @llvm.experimental.constrained.fptrunc.f64.f80(x86_fp80 [[TMP14]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[RETVAL]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[RETVAL]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: store double [[UNPROMOTION]], ptr [[RETVAL_REALP]], align 8 +// PRMTD_STRICT-NEXT: store double [[UNPROMOTION4]], ptr [[RETVAL_IMAGP]], align 8 +// PRMTD_STRICT-NEXT: [[TMP15:%.*]] = load { double, double }, ptr [[RETVAL]], align 8 +// PRMTD_STRICT-NEXT: ret { double, double } [[TMP15]] +// _Complex double divd(_Complex double a, _Complex double b) { return a / b; } @@ -1834,6 +2087,78 @@ _Complex double divd(_Complex double a, _Complex double b) { // PRMTD_FAST-NEXT: [[TMP4:%.*]] = load { double, double }, ptr [[RETVAL]], align 8 // PRMTD_FAST-NEXT: ret { 
double, double } [[TMP4]] // +// X86WINPRMTD_STRICT-LABEL: define dso_local void @muld( +// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-NEXT: entry: +// X86WINPRMTD_STRICT-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 8 +// X86WINPRMTD_STRICT-NEXT: [[B_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 +// X86WINPRMTD_STRICT-NEXT: [[A_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 +// X86WINPRMTD_STRICT-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 8 +// X86WINPRMTD_STRICT-NEXT: store ptr [[B]], ptr [[B_INDIRECT_ADDR]], align 8 +// X86WINPRMTD_STRICT-NEXT: store ptr [[A]], ptr [[A_INDIRECT_ADDR]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load double, ptr [[A_IMAGP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[MUL_AC:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[MUL_BD:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: 
[[MUL_AD:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[MUL_BC:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[MUL_R:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[MUL_AC]], double [[MUL_BD]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[MUL_I:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[MUL_AD]], double [[MUL_BC]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: store double [[MUL_R]], ptr [[AGG_RESULT_REALP]], align 8 +// X86WINPRMTD_STRICT-NEXT: store double [[MUL_I]], ptr [[AGG_RESULT_IMAGP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP1:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REAL:%.*]] = load double, ptr [[AGG_RESULT_REALP1]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP2:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAG:%.*]] = load double, ptr [[AGG_RESULT_IMAGP2]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP3:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP4:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: store double 
[[AGG_RESULT_REAL]], ptr [[AGG_RESULT_REALP3]], align 8 +// X86WINPRMTD_STRICT-NEXT: store double [[AGG_RESULT_IMAG]], ptr [[AGG_RESULT_IMAGP4]], align 8 +// X86WINPRMTD_STRICT-NEXT: ret void +// +// PRMTD_STRICT-LABEL: define dso_local { double, double } @muld( +// PRMTD_STRICT-SAME: double noundef [[A_COERCE0:%.*]], double noundef [[A_COERCE1:%.*]], double noundef [[B_COERCE0:%.*]], double noundef [[B_COERCE1:%.*]]) #[[ATTR2]] { +// PRMTD_STRICT-NEXT: entry: +// PRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { double, double }, align 8 +// PRMTD_STRICT-NEXT: [[A:%.*]] = alloca { double, double }, align 8 +// PRMTD_STRICT-NEXT: [[B:%.*]] = alloca { double, double }, align 8 +// PRMTD_STRICT-NEXT: [[TMP0:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: store double [[A_COERCE0]], ptr [[TMP0]], align 8 +// PRMTD_STRICT-NEXT: [[TMP1:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: store double [[A_COERCE1]], ptr [[TMP1]], align 8 +// PRMTD_STRICT-NEXT: [[TMP2:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: store double [[B_COERCE0]], ptr [[TMP2]], align 8 +// PRMTD_STRICT-NEXT: [[TMP3:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: store double [[B_COERCE1]], ptr [[TMP3]], align 8 +// PRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 +// PRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load double, ptr [[A_IMAGP]], align 8 +// PRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 +// PRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = 
getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 8 +// PRMTD_STRICT-NEXT: [[MUL_AC:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[MUL_BD:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[MUL_AD:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[MUL_BC:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[MUL_R:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[MUL_AC]], double [[MUL_BD]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[MUL_I:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[MUL_AD]], double [[MUL_BC]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[RETVAL]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[RETVAL]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: store double [[MUL_R]], ptr [[RETVAL_REALP]], align 8 +// PRMTD_STRICT-NEXT: store double [[MUL_I]], ptr [[RETVAL_IMAGP]], align 8 +// PRMTD_STRICT-NEXT: [[TMP4:%.*]] = load { double, double }, ptr [[RETVAL]], align 8 +// PRMTD_STRICT-NEXT: ret { double, double } [[TMP4]] +// _Complex double muld(_Complex double a, _Complex double b) { return a * b; } @@ -2316,6 +2641,114 @@ _Complex double muld(_Complex double 
a, _Complex double b) { // PRMTD_FAST-NEXT: [[TMP22:%.*]] = load { x86_fp80, x86_fp80 }, ptr [[RETVAL]], align 16 // PRMTD_FAST-NEXT: ret { x86_fp80, x86_fp80 } [[TMP22]] // +// X86WINPRMTD_STRICT-LABEL: define dso_local void @divld( +// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-NEXT: entry: +// X86WINPRMTD_STRICT-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 8 +// X86WINPRMTD_STRICT-NEXT: [[B_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 +// X86WINPRMTD_STRICT-NEXT: [[A_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 +// X86WINPRMTD_STRICT-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 8 +// X86WINPRMTD_STRICT-NEXT: store ptr [[B]], ptr [[B_INDIRECT_ADDR]], align 8 +// X86WINPRMTD_STRICT-NEXT: store ptr [[A]], ptr [[A_INDIRECT_ADDR]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load double, ptr [[A_IMAGP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[TMP0:%.*]] = call double @llvm.fabs.f64(double [[B_REAL]]) #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP1:%.*]] = call double @llvm.fabs.f64(double [[B_IMAG]]) #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[ABS_CMP:%.*]] = call i1 
@llvm.experimental.constrained.fcmp.f64(double [[TMP0]], double [[TMP1]], metadata !"ugt", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: br i1 [[ABS_CMP]], label [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI:%.*]], label [[ABS_RHSR_LESS_THAN_ABS_RHSI:%.*]] +// X86WINPRMTD_STRICT: abs_rhsr_greater_or_equal_abs_rhsi: +// X86WINPRMTD_STRICT-NEXT: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[B_IMAG]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP3:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[TMP2]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP4:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[B_REAL]], double [[TMP3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP5:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[A_REAL]], double [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP7:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP6]], double [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP8:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP9:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[A_IMAG]], double [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP10:%.*]] = call double 
@llvm.experimental.constrained.fdiv.f64(double [[TMP9]], double [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: br label [[COMPLEX_DIV:%.*]] +// X86WINPRMTD_STRICT: abs_rhsr_less_than_abs_rhsi: +// X86WINPRMTD_STRICT-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[B_REAL]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[TMP11]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[B_IMAG]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP14:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP14]], double [[A_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP16:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP15]], double [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP18:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP17]], double [[A_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP19:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP18]], double [[TMP13]], 
metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: br label [[COMPLEX_DIV]] +// X86WINPRMTD_STRICT: complex_div: +// X86WINPRMTD_STRICT-NEXT: [[TMP20:%.*]] = phi double [ [[TMP7]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP16]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ] +// X86WINPRMTD_STRICT-NEXT: [[TMP21:%.*]] = phi double [ [[TMP10]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP19]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ] +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: store double [[TMP20]], ptr [[AGG_RESULT_REALP]], align 8 +// X86WINPRMTD_STRICT-NEXT: store double [[TMP21]], ptr [[AGG_RESULT_IMAGP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP1:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REAL:%.*]] = load double, ptr [[AGG_RESULT_REALP1]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP2:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAG:%.*]] = load double, ptr [[AGG_RESULT_IMAGP2]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP3:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP4:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: store double [[AGG_RESULT_REAL]], ptr [[AGG_RESULT_REALP3]], align 8 +// X86WINPRMTD_STRICT-NEXT: store double [[AGG_RESULT_IMAG]], ptr [[AGG_RESULT_IMAGP4]], align 8 +// X86WINPRMTD_STRICT-NEXT: ret void +// +// PRMTD_STRICT-LABEL: define dso_local { x86_fp80, x86_fp80 } @divld( +// PRMTD_STRICT-SAME: ptr noundef 
byval({ x86_fp80, x86_fp80 }) align 16 [[A:%.*]], ptr noundef byval({ x86_fp80, x86_fp80 }) align 16 [[B:%.*]]) #[[ATTR2]] { +// PRMTD_STRICT-NEXT: entry: +// PRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { x86_fp80, x86_fp80 }, align 16 +// PRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[A]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load x86_fp80, ptr [[A_REALP]], align 16 +// PRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[A]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load x86_fp80, ptr [[A_IMAGP]], align 16 +// PRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[B]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load x86_fp80, ptr [[B_REALP]], align 16 +// PRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[B]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load x86_fp80, ptr [[B_IMAGP]], align 16 +// PRMTD_STRICT-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.fabs.f80(x86_fp80 [[B_REAL]]) #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP1:%.*]] = call x86_fp80 @llvm.fabs.f80(x86_fp80 [[B_IMAG]]) #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[ABS_CMP:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f80(x86_fp80 [[TMP0]], x86_fp80 [[TMP1]], metadata !"ugt", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: br i1 [[ABS_CMP]], label [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI:%.*]], label [[ABS_RHSR_LESS_THAN_ABS_RHSI:%.*]] +// PRMTD_STRICT: abs_rhsr_greater_or_equal_abs_rhsi: +// PRMTD_STRICT-NEXT: [[TMP2:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[B_IMAG]], x86_fp80 [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP3:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[TMP2]], x86_fp80 [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: 
[[TMP4:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[B_REAL]], x86_fp80 [[TMP3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP5:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[A_IMAG]], x86_fp80 [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP6:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[A_REAL]], x86_fp80 [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP7:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP6]], x86_fp80 [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP8:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[A_REAL]], x86_fp80 [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP9:%.*]] = call x86_fp80 @llvm.experimental.constrained.fsub.f80(x86_fp80 [[A_IMAG]], x86_fp80 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP10:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP9]], x86_fp80 [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: br label [[COMPLEX_DIV:%.*]] +// PRMTD_STRICT: abs_rhsr_less_than_abs_rhsi: +// PRMTD_STRICT-NEXT: [[TMP11:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[B_REAL]], x86_fp80 [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP12:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[TMP11]], x86_fp80 [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP13:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[B_IMAG]], x86_fp80 
[[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP14:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[A_REAL]], x86_fp80 [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP15:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[TMP14]], x86_fp80 [[A_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP16:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP15]], x86_fp80 [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP17:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[A_IMAG]], x86_fp80 [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP18:%.*]] = call x86_fp80 @llvm.experimental.constrained.fsub.f80(x86_fp80 [[TMP17]], x86_fp80 [[A_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP19:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP18]], x86_fp80 [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: br label [[COMPLEX_DIV]] +// PRMTD_STRICT: complex_div: +// PRMTD_STRICT-NEXT: [[TMP20:%.*]] = phi x86_fp80 [ [[TMP7]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP16]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ] +// PRMTD_STRICT-NEXT: [[TMP21:%.*]] = phi x86_fp80 [ [[TMP10]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP19]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ] +// PRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[RETVAL]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[RETVAL]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: store x86_fp80 [[TMP20]], ptr [[RETVAL_REALP]], align 16 +// 
PRMTD_STRICT-NEXT: store x86_fp80 [[TMP21]], ptr [[RETVAL_IMAGP]], align 16 +// PRMTD_STRICT-NEXT: [[TMP22:%.*]] = load { x86_fp80, x86_fp80 }, ptr [[RETVAL]], align 16 +// PRMTD_STRICT-NEXT: ret { x86_fp80, x86_fp80 } [[TMP22]] +// _Complex long double divld(_Complex long double a, _Complex long double b) { return a / b; } @@ -2659,6 +3092,68 @@ _Complex long double divld(_Complex long double a, _Complex long double b) { // PRMTD_FAST-NEXT: [[TMP0:%.*]] = load { x86_fp80, x86_fp80 }, ptr [[RETVAL]], align 16 // PRMTD_FAST-NEXT: ret { x86_fp80, x86_fp80 } [[TMP0]] // +// X86WINPRMTD_STRICT-LABEL: define dso_local void @mulld( +// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-NEXT: entry: +// X86WINPRMTD_STRICT-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 8 +// X86WINPRMTD_STRICT-NEXT: [[B_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 +// X86WINPRMTD_STRICT-NEXT: [[A_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 +// X86WINPRMTD_STRICT-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 8 +// X86WINPRMTD_STRICT-NEXT: store ptr [[B]], ptr [[B_INDIRECT_ADDR]], align 8 +// X86WINPRMTD_STRICT-NEXT: store ptr [[A]], ptr [[A_INDIRECT_ADDR]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[A]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load double, ptr [[A_IMAGP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr 
[[B]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[MUL_AC:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[MUL_BD:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[MUL_AD:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_REAL]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[MUL_BC:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[A_IMAG]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[MUL_R:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[MUL_AC]], double [[MUL_BD]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[MUL_I:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[MUL_AD]], double [[MUL_BC]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: store double [[MUL_R]], ptr [[AGG_RESULT_REALP]], align 8 +// X86WINPRMTD_STRICT-NEXT: store double [[MUL_I]], ptr [[AGG_RESULT_IMAGP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP1:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REAL:%.*]] = load double, ptr [[AGG_RESULT_REALP1]], align 8 +// 
X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP2:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAG:%.*]] = load double, ptr [[AGG_RESULT_IMAGP2]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_REALP3:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[AGG_RESULT_IMAGP4:%.*]] = getelementptr inbounds { double, double }, ptr [[AGG_RESULT]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: store double [[AGG_RESULT_REAL]], ptr [[AGG_RESULT_REALP3]], align 8 +// X86WINPRMTD_STRICT-NEXT: store double [[AGG_RESULT_IMAG]], ptr [[AGG_RESULT_IMAGP4]], align 8 +// X86WINPRMTD_STRICT-NEXT: ret void +// +// PRMTD_STRICT-LABEL: define dso_local { x86_fp80, x86_fp80 } @mulld( +// PRMTD_STRICT-SAME: ptr noundef byval({ x86_fp80, x86_fp80 }) align 16 [[A:%.*]], ptr noundef byval({ x86_fp80, x86_fp80 }) align 16 [[B:%.*]]) #[[ATTR2]] { +// PRMTD_STRICT-NEXT: entry: +// PRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { x86_fp80, x86_fp80 }, align 16 +// PRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[A]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load x86_fp80, ptr [[A_REALP]], align 16 +// PRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[A]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load x86_fp80, ptr [[A_IMAGP]], align 16 +// PRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[B]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load x86_fp80, ptr [[B_REALP]], align 16 +// PRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[B]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load x86_fp80, ptr [[B_IMAGP]], align 16 +// PRMTD_STRICT-NEXT: [[MUL_AC:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[A_REAL]], x86_fp80 [[B_REAL]], metadata 
!"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[MUL_BD:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[A_IMAG]], x86_fp80 [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[MUL_AD:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[A_REAL]], x86_fp80 [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[MUL_BC:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[A_IMAG]], x86_fp80 [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[MUL_R:%.*]] = call x86_fp80 @llvm.experimental.constrained.fsub.f80(x86_fp80 [[MUL_AC]], x86_fp80 [[MUL_BD]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[MUL_I:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[MUL_AD]], x86_fp80 [[MUL_BC]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[RETVAL]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[RETVAL]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: store x86_fp80 [[MUL_R]], ptr [[RETVAL_REALP]], align 16 +// PRMTD_STRICT-NEXT: store x86_fp80 [[MUL_I]], ptr [[RETVAL_IMAGP]], align 16 +// PRMTD_STRICT-NEXT: [[TMP0:%.*]] = load { x86_fp80, x86_fp80 }, ptr [[RETVAL]], align 16 +// PRMTD_STRICT-NEXT: ret { x86_fp80, x86_fp80 } [[TMP0]] +// _Complex long double mulld(_Complex long double a, _Complex long double b) { return a * b; } @@ -3446,6 +3941,167 @@ _Complex long double mulld(_Complex long double a, _Complex long double b) { // PRMTD_FAST-NEXT: [[TMP33:%.*]] = load <2 x float>, ptr [[RETVAL]], align 4 // PRMTD_FAST-NEXT: ret <2 x float> [[TMP33]] // +// X86WINPRMTD_STRICT-LABEL: define dso_local 
i64 @f1( +// X86WINPRMTD_STRICT-SAME: i64 noundef [[A_COERCE:%.*]], ptr noundef [[B:%.*]], i64 noundef [[C_COERCE:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-NEXT: entry: +// X86WINPRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { float, float }, align 4 +// X86WINPRMTD_STRICT-NEXT: [[A:%.*]] = alloca { float, float }, align 4 +// X86WINPRMTD_STRICT-NEXT: [[C:%.*]] = alloca { float, float }, align 4 +// X86WINPRMTD_STRICT-NEXT: [[B_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 +// X86WINPRMTD_STRICT-NEXT: store i64 [[A_COERCE]], ptr [[A]], align 4 +// X86WINPRMTD_STRICT-NEXT: store i64 [[C_COERCE]], ptr [[C]], align 4 +// X86WINPRMTD_STRICT-NEXT: store ptr [[B]], ptr [[B_INDIRECT_ADDR]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[B]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load double, ptr [[B_IMAGP]], align 8 +// X86WINPRMTD_STRICT-NEXT: [[C_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[C]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[C_REAL:%.*]] = load float, ptr [[C_REALP]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[C_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[C]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[C_IMAG:%.*]] = load float, ptr [[C_IMAGP]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[CONV:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[C_REAL]], metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[CONV1:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[C_IMAG]], metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP0:%.*]] = call double @llvm.fabs.f64(double [[CONV]]) #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP1:%.*]] = call double @llvm.fabs.f64(double [[CONV1]]) #[[ATTR3]] 
+// X86WINPRMTD_STRICT-NEXT: [[ABS_CMP:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[TMP0]], double [[TMP1]], metadata !"ugt", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: br i1 [[ABS_CMP]], label [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI:%.*]], label [[ABS_RHSR_LESS_THAN_ABS_RHSI:%.*]] +// X86WINPRMTD_STRICT: abs_rhsr_greater_or_equal_abs_rhsi: +// X86WINPRMTD_STRICT-NEXT: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[CONV1]], double [[CONV]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP3:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[TMP2]], double [[CONV1]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP4:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[CONV]], double [[TMP3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP5:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[B_IMAG]], double [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[B_REAL]], double [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP7:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP6]], double [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP8:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[B_REAL]], double [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP9:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[B_IMAG]], double [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// 
X86WINPRMTD_STRICT-NEXT: [[TMP10:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP9]], double [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: br label [[COMPLEX_DIV:%.*]] +// X86WINPRMTD_STRICT: abs_rhsr_less_than_abs_rhsi: +// X86WINPRMTD_STRICT-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[CONV]], double [[CONV1]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[TMP11]], double [[CONV]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP13:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[CONV1]], double [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP14:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[B_REAL]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP15:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP14]], double [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP16:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP15]], double [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP17:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[B_IMAG]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP18:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP17]], double [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP19:%.*]] = call double 
@llvm.experimental.constrained.fdiv.f64(double [[TMP18]], double [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: br label [[COMPLEX_DIV]] +// X86WINPRMTD_STRICT: complex_div: +// X86WINPRMTD_STRICT-NEXT: [[TMP20:%.*]] = phi double [ [[TMP7]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP16]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ] +// X86WINPRMTD_STRICT-NEXT: [[TMP21:%.*]] = phi double [ [[TMP10]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP19]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ] +// X86WINPRMTD_STRICT-NEXT: [[CONV2:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[CONV3:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[EXT:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[CONV2]], metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[EXT4:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[CONV3]], metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[EXT5:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[A_REAL]], metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[EXT6:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[A_IMAG]], metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP22:%.*]] = call 
double @llvm.experimental.constrained.fmul.f64(double [[EXT]], double [[EXT5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP23:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT4]], double [[EXT6]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP24:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP22]], double [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP25:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT5]], double [[EXT5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP26:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT6]], double [[EXT6]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP25]], double [[TMP26]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT4]], double [[EXT5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT]], double [[EXT6]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP30:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP28]], double [[TMP29]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP31:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP24]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[TMP32:%.*]] = call double 
@llvm.experimental.constrained.fdiv.f64(double [[TMP30]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[UNPROMOTION:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[UNPROMOTION7:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP32]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR3]] +// X86WINPRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 0 +// X86WINPRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 1 +// X86WINPRMTD_STRICT-NEXT: store float [[UNPROMOTION]], ptr [[RETVAL_REALP]], align 4 +// X86WINPRMTD_STRICT-NEXT: store float [[UNPROMOTION7]], ptr [[RETVAL_IMAGP]], align 4 +// X86WINPRMTD_STRICT-NEXT: [[TMP33:%.*]] = load i64, ptr [[RETVAL]], align 4 +// X86WINPRMTD_STRICT-NEXT: ret i64 [[TMP33]] +// +// PRMTD_STRICT-LABEL: define dso_local <2 x float> @f1( +// PRMTD_STRICT-SAME: <2 x float> noundef [[A_COERCE:%.*]], ptr noundef byval({ x86_fp80, x86_fp80 }) align 16 [[B:%.*]], <2 x float> noundef [[C_COERCE:%.*]]) #[[ATTR0]] { +// PRMTD_STRICT-NEXT: entry: +// PRMTD_STRICT-NEXT: [[RETVAL:%.*]] = alloca { float, float }, align 4 +// PRMTD_STRICT-NEXT: [[A:%.*]] = alloca { float, float }, align 4 +// PRMTD_STRICT-NEXT: [[C:%.*]] = alloca { float, float }, align 4 +// PRMTD_STRICT-NEXT: store <2 x float> [[A_COERCE]], ptr [[A]], align 4 +// PRMTD_STRICT-NEXT: store <2 x float> [[C_COERCE]], ptr [[C]], align 4 +// PRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[B]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load x86_fp80, ptr [[B_REALP]], align 16 +// PRMTD_STRICT-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { x86_fp80, x86_fp80 }, ptr [[B]], i32 
0, i32 1 +// PRMTD_STRICT-NEXT: [[B_IMAG:%.*]] = load x86_fp80, ptr [[B_IMAGP]], align 16 +// PRMTD_STRICT-NEXT: [[C_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[C]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[C_REAL:%.*]] = load float, ptr [[C_REALP]], align 4 +// PRMTD_STRICT-NEXT: [[C_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[C]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: [[C_IMAG:%.*]] = load float, ptr [[C_IMAGP]], align 4 +// PRMTD_STRICT-NEXT: [[CONV:%.*]] = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f32(float [[C_REAL]], metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[CONV1:%.*]] = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f32(float [[C_IMAG]], metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.fabs.f80(x86_fp80 [[CONV]]) #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP1:%.*]] = call x86_fp80 @llvm.fabs.f80(x86_fp80 [[CONV1]]) #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[ABS_CMP:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f80(x86_fp80 [[TMP0]], x86_fp80 [[TMP1]], metadata !"ugt", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: br i1 [[ABS_CMP]], label [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI:%.*]], label [[ABS_RHSR_LESS_THAN_ABS_RHSI:%.*]] +// PRMTD_STRICT: abs_rhsr_greater_or_equal_abs_rhsi: +// PRMTD_STRICT-NEXT: [[TMP2:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[CONV1]], x86_fp80 [[CONV]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP3:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[TMP2]], x86_fp80 [[CONV1]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP4:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[CONV]], x86_fp80 [[TMP3]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP5:%.*]] = call x86_fp80 
@llvm.experimental.constrained.fmul.f80(x86_fp80 [[B_IMAG]], x86_fp80 [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP6:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[B_REAL]], x86_fp80 [[TMP5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP7:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP6]], x86_fp80 [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP8:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[B_REAL]], x86_fp80 [[TMP2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP9:%.*]] = call x86_fp80 @llvm.experimental.constrained.fsub.f80(x86_fp80 [[B_IMAG]], x86_fp80 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP10:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP9]], x86_fp80 [[TMP4]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: br label [[COMPLEX_DIV:%.*]] +// PRMTD_STRICT: abs_rhsr_less_than_abs_rhsi: +// PRMTD_STRICT-NEXT: [[TMP11:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[CONV]], x86_fp80 [[CONV1]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP12:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[TMP11]], x86_fp80 [[CONV]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP13:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[CONV1]], x86_fp80 [[TMP12]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP14:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[B_REAL]], x86_fp80 [[TMP11]], metadata !"round.dynamic", 
metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP15:%.*]] = call x86_fp80 @llvm.experimental.constrained.fadd.f80(x86_fp80 [[TMP14]], x86_fp80 [[B_IMAG]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP16:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP15]], x86_fp80 [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP17:%.*]] = call x86_fp80 @llvm.experimental.constrained.fmul.f80(x86_fp80 [[B_IMAG]], x86_fp80 [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP18:%.*]] = call x86_fp80 @llvm.experimental.constrained.fsub.f80(x86_fp80 [[TMP17]], x86_fp80 [[B_REAL]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP19:%.*]] = call x86_fp80 @llvm.experimental.constrained.fdiv.f80(x86_fp80 [[TMP18]], x86_fp80 [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: br label [[COMPLEX_DIV]] +// PRMTD_STRICT: complex_div: +// PRMTD_STRICT-NEXT: [[TMP20:%.*]] = phi x86_fp80 [ [[TMP7]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP16]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ] +// PRMTD_STRICT-NEXT: [[TMP21:%.*]] = phi x86_fp80 [ [[TMP10]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP19]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ] +// PRMTD_STRICT-NEXT: [[CONV2:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f80(x86_fp80 [[TMP20]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[CONV3:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f80(x86_fp80 [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[EXT:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[CONV2]], metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[EXT4:%.*]] = call double 
@llvm.experimental.constrained.fpext.f64.f32(float [[CONV3]], metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 4 +// PRMTD_STRICT-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 4 +// PRMTD_STRICT-NEXT: [[EXT5:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[A_REAL]], metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[EXT6:%.*]] = call double @llvm.experimental.constrained.fpext.f64.f32(float [[A_IMAG]], metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP22:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT]], double [[EXT5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP23:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT4]], double [[EXT6]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP24:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP22]], double [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP25:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT5]], double [[EXT5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP26:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT6]], double [[EXT6]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP25]], double [[TMP26]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP28:%.*]] = call double 
@llvm.experimental.constrained.fmul.f64(double [[EXT4]], double [[EXT5]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[EXT]], double [[EXT6]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP30:%.*]] = call double @llvm.experimental.constrained.fsub.f64(double [[TMP28]], double [[TMP29]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP31:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP24]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[TMP32:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[TMP30]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[UNPROMOTION:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP31]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[UNPROMOTION7:%.*]] = call float @llvm.experimental.constrained.fptrunc.f32.f64(double [[TMP32]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]] +// PRMTD_STRICT-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 0 +// PRMTD_STRICT-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 1 +// PRMTD_STRICT-NEXT: store float [[UNPROMOTION]], ptr [[RETVAL_REALP]], align 4 +// PRMTD_STRICT-NEXT: store float [[UNPROMOTION7]], ptr [[RETVAL_IMAGP]], align 4 +// PRMTD_STRICT-NEXT: [[TMP33:%.*]] = load <2 x float>, ptr [[RETVAL]], align 4 +// PRMTD_STRICT-NEXT: ret <2 x float> [[TMP33]] +// _Complex float f1(_Complex float a, _Complex long double b, _Complex float c) { return (_Complex float)(b / c) / a; } +//. 
+// FULL: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575} +//. +// FULL_FAST: [[PROF2]] = !{!"branch_weights", i32 1, i32 1048575} +//. -- cgit v1.2.3 From 608a663c8ee485c42637d021d554c8d264d556b1 Mon Sep 17 00:00:00 2001 From: Philip Lassen Date: Thu, 4 Apr 2024 03:26:33 -0700 Subject: [MLIR] Clean up pass options for test-loop-fusion and affine-super-vectorizer-test (#87606) Before the change `test-loop-fusion` and `affine-super-vectorizer-test` options were in their own category. This was because they used the standard llvm command line parsing with `llvm::cl::opt`. This PR moves them over to the mlir `Pass::Option` class. Before the change ``` $ mlir-opt --help ... General options: ... Compiler passes to run Passes: ... Pass Pipelines: ... Generic Options: .... affine-super-vectorizer-test options: --backward-slicing ... --vectorize-affine-loop-nest test-loop-fusion options: --test-loop-fusion-dependence-check ... --test-loop-fusion-transformation ``` After the change ``` $ mlir-opt --help ... General options: ... Compiler passes to run Passes: ... --affine-super-vectorizer-test --backward-slicing ... --vectorize-affine-loop-nest ... --test-loop-fusion options: --test-loop-fusion-dependence-check ... --test-loop-fusion-transformation ... Pass Pipelines: ... Generic Options: ... 
``` --------- Signed-off-by: philass --- .../Affine/SuperVectorize/compose_maps.mlir | 2 +- .../Affine/SuperVectorize/vector_utils.mlir | 6 +- .../SuperVectorize/vectorize_unsupported.mlir | 2 +- .../Affine/loop-fusion-dependence-check.mlir | 2 +- .../Affine/loop-fusion-slice-computation.mlir | 2 +- .../Dialect/Affine/loop-fusion-transformation.mlir | 2 +- mlir/test/Dialect/Affine/slicing-utils.mlir | 6 +- mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp | 35 ++++++------ .../lib/Dialect/Affine/TestVectorizationUtils.cpp | 64 +++++++++++----------- 9 files changed, 60 insertions(+), 61 deletions(-) diff --git a/mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir b/mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir index b53fc55fdac9..d998ed874f41 100644 --- a/mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s -affine-super-vectorizer-test -compose-maps -split-input-file 2>&1 | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect %s -affine-super-vectorizer-test=compose-maps -split-input-file 2>&1 | FileCheck %s // For all these cases, the test traverses the `test_affine_map` ops and // composes them in order one-by-one. 
diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir index e58de9fdaa76..bd71164244c0 100644 --- a/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir @@ -1,6 +1,6 @@ -// RUN: mlir-opt %s -affine-super-vectorizer-test -vector-shape-ratio 4 -vector-shape-ratio 8 2>&1 | FileCheck %s -// RUN: mlir-opt %s -affine-super-vectorizer-test -vector-shape-ratio 2 -vector-shape-ratio 5 -vector-shape-ratio 2 2>&1 | FileCheck %s -check-prefix=TEST-3x4x5x8 -// RUN: mlir-opt %s -affine-super-vectorizer-test -vectorize-affine-loop-nest 2>&1 | FileCheck %s -check-prefix=VECNEST +// RUN: mlir-opt %s -affine-super-vectorizer-test="vector-shape-ratio=4,8" 2>&1 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorizer-test="vector-shape-ratio=2,5,2" 2>&1 | FileCheck %s -check-prefix=TEST-3x4x5x8 +// RUN: mlir-opt %s -affine-super-vectorizer-test=vectorize-affine-loop-nest 2>&1 | FileCheck %s -check-prefix=VECNEST func.func @vector_add_2d(%arg0: index, %arg1: index) -> f32 { // Nothing should be matched in this first block. 
diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_unsupported.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_unsupported.mlir index c117bfc82ca0..6c1a7c48c4cb 100644 --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_unsupported.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_unsupported.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -affine-super-vectorizer-test -vectorize-affine-loop-nest -split-input-file 2>&1 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorizer-test=vectorize-affine-loop-nest -split-input-file 2>&1 | FileCheck %s func.func @unparallel_loop_reduction_unsupported(%in: memref<256x512xf32>, %out: memref<256xf32>) { // CHECK: Outermost loop cannot be parallel diff --git a/mlir/test/Dialect/Affine/loop-fusion-dependence-check.mlir b/mlir/test/Dialect/Affine/loop-fusion-dependence-check.mlir index aa872b0f6911..2c53852a8cec 100644 --- a/mlir/test/Dialect/Affine/loop-fusion-dependence-check.mlir +++ b/mlir/test/Dialect/Affine/loop-fusion-dependence-check.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s -test-loop-fusion -test-loop-fusion-dependence-check -split-input-file -verify-diagnostics | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect %s -test-loop-fusion=test-loop-fusion-dependence-check -split-input-file -verify-diagnostics | FileCheck %s // ----- diff --git a/mlir/test/Dialect/Affine/loop-fusion-slice-computation.mlir b/mlir/test/Dialect/Affine/loop-fusion-slice-computation.mlir index c303dd0c01ec..aa79ee26928e 100644 --- a/mlir/test/Dialect/Affine/loop-fusion-slice-computation.mlir +++ b/mlir/test/Dialect/Affine/loop-fusion-slice-computation.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -test-loop-fusion -test-loop-fusion-slice-computation -split-input-file -verify-diagnostics | FileCheck %s +// RUN: mlir-opt %s -test-loop-fusion=test-loop-fusion-slice-computation -split-input-file -verify-diagnostics | FileCheck %s // ----- diff --git 
a/mlir/test/Dialect/Affine/loop-fusion-transformation.mlir b/mlir/test/Dialect/Affine/loop-fusion-transformation.mlir index c8e0918e6bfd..4f4163a2bbd5 100644 --- a/mlir/test/Dialect/Affine/loop-fusion-transformation.mlir +++ b/mlir/test/Dialect/Affine/loop-fusion-transformation.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -allow-unregistered-dialect -test-loop-fusion -test-loop-fusion-transformation -split-input-file -canonicalize | FileCheck %s +// RUN: mlir-opt %s -allow-unregistered-dialect -test-loop-fusion=test-loop-fusion-transformation -split-input-file -canonicalize | FileCheck %s // CHECK-LABEL: func @slice_depth1_loop_nest() { func.func @slice_depth1_loop_nest() { diff --git a/mlir/test/Dialect/Affine/slicing-utils.mlir b/mlir/test/Dialect/Affine/slicing-utils.mlir index 71bd8ad4ef9d..74379978fdf8 100644 --- a/mlir/test/Dialect/Affine/slicing-utils.mlir +++ b/mlir/test/Dialect/Affine/slicing-utils.mlir @@ -1,6 +1,6 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-super-vectorizer-test -forward-slicing=true 2>&1 | FileCheck %s --check-prefix=FWD -// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-super-vectorizer-test -backward-slicing=true 2>&1 | FileCheck %s --check-prefix=BWD -// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-super-vectorizer-test -slicing=true 2>&1 | FileCheck %s --check-prefix=FWDBWD +// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-super-vectorizer-test="forward-slicing=true" 2>&1 | FileCheck %s --check-prefix=FWD +// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-super-vectorizer-test="backward-slicing=true" 2>&1 | FileCheck %s --check-prefix=BWD +// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-super-vectorizer-test="slicing=true" 2>&1 | FileCheck %s --check-prefix=FWDBWD /// 1 2 3 4 /// |_______| |______| diff --git a/mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp 
b/mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp index f4f1593dc53e..19011803a793 100644 --- a/mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp +++ b/mlir/test/lib/Dialect/Affine/TestLoopFusion.cpp @@ -22,23 +22,6 @@ using namespace mlir; using namespace mlir::affine; -static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); - -static llvm::cl::opt clTestDependenceCheck( - "test-loop-fusion-dependence-check", - llvm::cl::desc("Enable testing of loop fusion dependence check"), - llvm::cl::cat(clOptionsCategory)); - -static llvm::cl::opt clTestSliceComputation( - "test-loop-fusion-slice-computation", - llvm::cl::desc("Enable testing of loop fusion slice computation"), - llvm::cl::cat(clOptionsCategory)); - -static llvm::cl::opt clTestLoopFusionTransformation( - "test-loop-fusion-transformation", - llvm::cl::desc("Enable testing of loop fusion transformation"), - llvm::cl::cat(clOptionsCategory)); - namespace { struct TestLoopFusion @@ -50,6 +33,24 @@ struct TestLoopFusion return "Tests loop fusion utility functions."; } void runOnOperation() override; + + TestLoopFusion() = default; + TestLoopFusion(const TestLoopFusion &pass) : PassWrapper(pass){}; + + Option clTestDependenceCheck{ + *this, "test-loop-fusion-dependence-check", + llvm::cl::desc("Enable testing of loop fusion dependence check"), + llvm::cl::init(false)}; + + Option clTestSliceComputation{ + *this, "test-loop-fusion-slice-computation", + llvm::cl::desc("Enable testing of loop fusion slice computation"), + llvm::cl::init(false)}; + + Option clTestLoopFusionTransformation{ + *this, "test-loop-fusion-transformation", + llvm::cl::desc("Enable testing of loop fusion transformation"), + llvm::cl::init(false)}; }; } // namespace diff --git a/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp index b497f8d75fde..598678f64cb4 100644 --- a/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp +++ 
b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp @@ -37,39 +37,6 @@ using namespace mlir::affine; static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); -static llvm::cl::list clTestVectorShapeRatio( - "vector-shape-ratio", - llvm::cl::desc("Specify the HW vector size for vectorization"), - llvm::cl::cat(clOptionsCategory)); -static llvm::cl::opt clTestForwardSlicingAnalysis( - "forward-slicing", - llvm::cl::desc("Enable testing forward static slicing and topological sort " - "functionalities"), - llvm::cl::cat(clOptionsCategory)); -static llvm::cl::opt clTestBackwardSlicingAnalysis( - "backward-slicing", - llvm::cl::desc("Enable testing backward static slicing and " - "topological sort functionalities"), - llvm::cl::cat(clOptionsCategory)); -static llvm::cl::opt clTestSlicingAnalysis( - "slicing", - llvm::cl::desc("Enable testing static slicing and topological sort " - "functionalities"), - llvm::cl::cat(clOptionsCategory)); -static llvm::cl::opt clTestComposeMaps( - "compose-maps", - llvm::cl::desc( - "Enable testing the composition of AffineMap where each " - "AffineMap in the composition is specified as the affine_map attribute " - "in a constant op."), - llvm::cl::cat(clOptionsCategory)); -static llvm::cl::opt clTestVecAffineLoopNest( - "vectorize-affine-loop-nest", - llvm::cl::desc( - "Enable testing for the 'vectorizeAffineLoopNest' utility by " - "vectorizing the outermost loops found"), - llvm::cl::cat(clOptionsCategory)); - namespace { struct VectorizerTestPass : public PassWrapper> { @@ -85,6 +52,37 @@ struct VectorizerTestPass return "Tests vectorizer standalone functionality."; } + VectorizerTestPass() = default; + VectorizerTestPass(const VectorizerTestPass &pass) : PassWrapper(pass){}; + + ListOption clTestVectorShapeRatio{ + *this, "vector-shape-ratio", + llvm::cl::desc("Specify the HW vector size for vectorization")}; + Option clTestForwardSlicingAnalysis{ + *this, "forward-slicing", + llvm::cl::desc( + "Enable testing 
forward static slicing and topological sort " + "functionalities")}; + Option clTestBackwardSlicingAnalysis{ + *this, "backward-slicing", + llvm::cl::desc("Enable testing backward static slicing and " + "topological sort functionalities")}; + Option clTestSlicingAnalysis{ + *this, "slicing", + llvm::cl::desc("Enable testing static slicing and topological sort " + "functionalities")}; + Option clTestComposeMaps{ + *this, "compose-maps", + llvm::cl::desc("Enable testing the composition of AffineMap where each " + "AffineMap in the composition is specified as the " + "affine_map attribute " + "in a constant op.")}; + Option clTestVecAffineLoopNest{ + *this, "vectorize-affine-loop-nest", + llvm::cl::desc( + "Enable testing for the 'vectorizeAffineLoopNest' utility by " + "vectorizing the outermost loops found")}; + void runOnOperation() override; void testVectorShapeRatio(llvm::raw_ostream &outs); void testForwardSlicing(llvm::raw_ostream &outs); -- cgit v1.2.3 From 4e0b8eae4cb4328f98e6b748c31050a704d378f6 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 29 Mar 2024 20:48:03 +0800 Subject: [RISCV] Add tests for vwsll for extends > .vf2. 
NFC These cannot be picked up by TableGen patterns alone and need to be handled by combineBinOp_VLToVWBinOp_VL --- llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll | 256 ++++++++++++++++++++++++++++ 1 file changed, 256 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll index 770bb566c764..082de2e7bf77 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll @@ -627,3 +627,259 @@ define @vwsll_vi_nxv8i16( %a) { %z = shl %x, splat (i16 2) ret %z } + +; ============================================================================== +; i8 -> i64 +; ============================================================================== + +define @vwsll_vv_nxv2i64_nxv2i8_sext( %a, %b) { +; CHECK-LABEL: vwsll_vv_nxv2i64_nxv2i8_sext: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf8 v10, v8 +; CHECK-NEXT: vsext.vf8 v12, v9 +; CHECK-NEXT: vsll.vv v8, v10, v12 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: vwsll_vv_nxv2i64_nxv2i8_sext: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8 +; CHECK-ZVBB-NEXT: vsext.vf8 v12, v9 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: ret + %x = zext %a to + %y = sext %b to + %z = shl %x, %y + ret %z +} + +define @vwsll_vv_nxv2i64_nxv2i8_zext( %a, %b) { +; CHECK-LABEL: vwsll_vv_nxv2i64_nxv2i8_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf8 v10, v8 +; CHECK-NEXT: vzext.vf8 v12, v9 +; CHECK-NEXT: vsll.vv v8, v10, v12 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: vwsll_vv_nxv2i64_nxv2i8_zext: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf8 v12, v9 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: ret + %x = zext %a to + %y = zext %b to + %z = shl %x, %y + ret %z +} + +define 
@vwsll_vx_i64_nxv2i64_nxv2i8( %a, i64 %b) { +; CHECK-LABEL: vwsll_vx_i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf8 v10, v8 +; CHECK-NEXT: vsll.vx v8, v10, a0 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: vwsll_vx_i64_nxv2i64_nxv2i8: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8 +; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: ret + %head = insertelement poison, i64 %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %x = zext %a to + %z = shl %x, %splat + ret %z +} + +define @vwsll_vx_i32_nxv2i64_nxv2i8_sext( %a, i32 %b) { +; CHECK-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_sext: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf8 v10, v8 +; CHECK-NEXT: vsext.vf2 v12, v9 +; CHECK-NEXT: vsll.vv v8, v10, v12 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_sext: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8 +; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: ret + %head = insertelement poison, i32 %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %x = zext %a to + %y = sext %splat to + %z = shl %x, %y + ret %z +} + +define @vwsll_vx_i32_nxv2i64_nxv2i8_zext( %a, i32 %b) { +; CHECK-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf8 v10, v8 +; CHECK-NEXT: vzext.vf2 v12, v9 +; CHECK-NEXT: vsll.vv v8, v10, v12 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_zext: +; 
CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: ret + %head = insertelement poison, i32 %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %x = zext %a to + %y = zext %splat to + %z = shl %x, %y + ret %z +} + +define @vwsll_vx_i16_nxv2i64_nxv2i8_sext( %a, i16 %b) { +; CHECK-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_sext: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf8 v10, v8 +; CHECK-NEXT: vsext.vf4 v12, v9 +; CHECK-NEXT: vsll.vv v8, v10, v12 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_sext: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8 +; CHECK-ZVBB-NEXT: vsext.vf4 v12, v9 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: ret + %head = insertelement poison, i16 %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %x = zext %a to + %y = sext %splat to + %z = shl %x, %y + ret %z +} + +define @vwsll_vx_i16_nxv2i64_nxv2i8_zext( %a, i16 %b) { +; CHECK-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf8 v10, v8 +; CHECK-NEXT: vzext.vf4 v12, v9 +; CHECK-NEXT: vsll.vv v8, v10, v12 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_zext: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, 
m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf4 v12, v9 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: ret + %head = insertelement poison, i16 %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %x = zext %a to + %y = zext %splat to + %z = shl %x, %y + ret %z +} + +define @vwsll_vx_i8_nxv2i64_nxv2i8_sext( %a, i8 %b) { +; CHECK-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_sext: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf8 v10, v8 +; CHECK-NEXT: vsext.vf8 v12, v9 +; CHECK-NEXT: vsll.vv v8, v10, v12 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_sext: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8 +; CHECK-ZVBB-NEXT: vsext.vf8 v12, v9 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: ret + %head = insertelement poison, i8 %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %x = zext %a to + %y = sext %splat to + %z = shl %x, %y + ret %z +} + +define @vwsll_vx_i8_nxv2i64_nxv2i8_zext( %a, i8 %b) { +; CHECK-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf8 v10, v8 +; CHECK-NEXT: vzext.vf8 v12, v9 +; CHECK-NEXT: vsll.vv v8, v10, v12 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_zext: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; CHECK-ZVBB-NEXT: vmv.v.x v9, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf8 v12, v9 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 +; CHECK-ZVBB-NEXT: ret + %head = 
insertelement poison, i8 %b, i32 0 + %splat = shufflevector %head, poison, zeroinitializer + %x = zext %a to + %y = zext %splat to + %z = shl %x, %y + ret %z +} + +define @vwsll_vi_nxv2i64_nxv2i8( %a) { +; CHECK-LABEL: vwsll_vi_nxv2i64_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf8 v10, v8 +; CHECK-NEXT: vsll.vi v8, v10, 2 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: vwsll_vi_nxv2i64_nxv2i8: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8 +; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2 +; CHECK-ZVBB-NEXT: ret + %x = zext %a to + %z = shl %x, splat (i64 2) + ret %z +} -- cgit v1.2.3 From 7bd163d0a4b3c9c9375dc32c9c10162433c42180 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 4 Apr 2024 12:05:08 +0100 Subject: [VPlan] Clean up dead recipes after UF & VF specific simplification. Recursively remove dead recipes after simplifying vector loop exit branch. --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 23 ++++++++++++++++++++++ .../LoopVectorize/AArch64/clamped-trip-count.ll | 6 ++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 3753060cd6ec..7d4e54d09945 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -641,6 +641,25 @@ static void removeRedundantExpandSCEVRecipes(VPlan &Plan) { } } +static void recursivelyDeleteDeadRecipes(VPValue *V) { + SmallVector WorkList; + SmallPtrSet Seen; + WorkList.push_back(V); + + while (!WorkList.empty()) { + VPValue *Cur = WorkList.pop_back_val(); + if (!Seen.insert(Cur).second) + continue; + VPRecipeBase *R = Cur->getDefiningRecipe(); + if (!R) + continue; + if (!isDeadRecipe(*R)) + continue; + WorkList.append(R->op_begin(), R->op_end()); + R->eraseFromParent(); + } +} + void VPlanTransforms::optimizeForVFAndUF(VPlan 
&Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE) { @@ -674,7 +693,11 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}); + + SmallVector PossiblyDead(Term->operands()); Term->eraseFromParent(); + for (VPValue *Op : PossiblyDead) + recursivelyDeleteDeadRecipes(Op); ExitingVPBB->appendRecipe(BOC); Plan.setVF(BestVF); Plan.setUF(BestUF); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll index 3e895edcd4f4..afd49aa9f0b8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll @@ -43,9 +43,8 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP16]], ptr [[TMP17]], i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8) ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8) ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -135,9 +134,8 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 ; CHECK-NEXT: call 
void @llvm.masked.store.nxv8i8.p0( [[TMP16]], ptr [[TMP17]], i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -- cgit v1.2.3 From 8ae9c6259f833dce87f8d29402b1ddced90887b9 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Thu, 4 Apr 2024 13:19:43 +0200 Subject: [libc] Fix forward `constexpr` `add_with_carry` / `sub_with_borrow` Introduced in #87613. 
--- libc/src/__support/math_extras.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/src/__support/math_extras.h b/libc/src/__support/math_extras.h index bb6424bfa8e4..4bd871957406 100644 --- a/libc/src/__support/math_extras.h +++ b/libc/src/__support/math_extras.h @@ -86,7 +86,7 @@ add_with_carry(T a, T b, T carry_in, T &carry_out) { RETURN_IF(unsigned long long, __builtin_addcll) #endif } - T sum; + T sum = {}; T carry1 = add_overflow(a, b, sum); T carry2 = add_overflow(sum, carry_in, sum); carry_out = carry1 | carry2; @@ -112,7 +112,7 @@ sub_with_borrow(T a, T b, T carry_in, T &carry_out) { RETURN_IF(unsigned long long, __builtin_subcll) #endif } - T sub; + T sub = {}; T carry1 = sub_overflow(a, b, sub); T carry2 = sub_overflow(sub, carry_in, sub); carry_out = carry1 | carry2; -- cgit v1.2.3 From 1c7fda9f4cde336c4ac30c7478b223536c6eb6d6 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Apr 2024 11:49:20 +0100 Subject: Fix MSVC "result of 32-bit shift implicitly converted to 64 bits" warning. NFC. --- llvm/tools/llvm-readobj/ELFDumper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index d6dda6198017..d3534820c117 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -5157,7 +5157,7 @@ static bool printAArch64PAuthABICoreInfo(raw_ostream &OS, uint32_t DataSize, std::string Desc; for (uint32_t I = 0, End = Flags.size(); I < End; ++I) { - if (!(Version & (1 << I))) + if (!(Version & (1ULL << I))) Desc += '!'; Desc += Twine("PointerAuth" + Flags[I] + (I == End - 1 ? "" : ", ")).str(); -- cgit v1.2.3 From d54d476300d7aaeb6616a11b43e1b8006cad17c9 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Apr 2024 12:24:24 +0100 Subject: [SLP] Fix Wunused-variable warning. NFC. 
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 99769540f780..79d146aec0c5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1973,7 +1973,7 @@ public: assert(isa(VL[0]) && "Expected instruction"); unsigned NumOperands = cast(VL[0])->getNumOperands(); constexpr unsigned IntrinsicNumOperands = 2; - if (auto *CI = dyn_cast(VL[0])) + if (isa(VL[0])) NumOperands = IntrinsicNumOperands; OpsVec.resize(NumOperands); unsigned NumLanes = VL.size(); -- cgit v1.2.3 From 3871eaba6bd016b229f2d0e4b75e2be3b65e39a7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Apr 2024 12:26:25 +0100 Subject: [CostModel][X86] Update AVX1 sext v8i1 -> v8i32 cost based off worst case llvm-mca numbers We were using raw instruction count which overestimated the costs for #67803 --- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 2 +- llvm/test/Analysis/CostModel/X86/cast.ll | 6 +++--- llvm/test/Analysis/CostModel/X86/extend.ll | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 2ec29463d2fa..2092675a6924 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2666,7 +2666,7 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, static const TypeConversionCostTblEntry AVXConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, - { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, diff --git 
a/llvm/test/Analysis/CostModel/X86/cast.ll b/llvm/test/Analysis/CostModel/X86/cast.ll index 64ed9bed13f3..d80cb09fd390 100644 --- a/llvm/test/Analysis/CostModel/X86/cast.ll +++ b/llvm/test/Analysis/CostModel/X86/cast.ll @@ -35,7 +35,7 @@ define i32 @add(i32 %arg) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %B = sext <4 x i1> undef to <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %C = trunc <4 x i32> undef to <4 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = zext <8 x i1> undef to <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %E = sext <8 x i1> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %E = sext <8 x i1> undef to <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %F = trunc <8 x i32> undef to <8 x i1> ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %G = zext i1 undef to i32 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %H = trunc i32 undef to i1 @@ -143,7 +143,7 @@ define i32 @zext_sext(<8 x i1> %in) { ; ; AVX1-LABEL: 'zext_sext' ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <8 x i1> %in to <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %S = sext <8 x i1> %in to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %S = sext <8 x i1> %in to <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A1 = zext <16 x i8> undef to <16 x i16> ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A2 = sext <16 x i8> undef to <16 x i16> ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A = sext <8 x i16> undef to <8 x i32> @@ -343,7 +343,7 @@ define i32 @masks8(<8 x i1> %in) { ; ; AVX1-LABEL: 'masks8' ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %Z = zext <8 x i1> %in to <8 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %S = sext <8 x i1> %in to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %S = sext <8 x i1> %in to <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'masks8' diff --git a/llvm/test/Analysis/CostModel/X86/extend.ll b/llvm/test/Analysis/CostModel/X86/extend.ll index 01efced3c13b..34fa3c426392 100644 --- a/llvm/test/Analysis/CostModel/X86/extend.ll +++ b/llvm/test/Analysis/CostModel/X86/extend.ll @@ -1971,7 +1971,7 @@ define i32 @sext_vXi1() "min-legal-vector-width"="256" { ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sext i1 undef to i32 ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32> -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32i32 = sext <32 x i1> undef to <32 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64i32 = sext <64 x i1> undef to <64 x i32> @@ -2251,7 +2251,7 @@ define i32 @sext_vXi1() "min-legal-vector-width"="256" { ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sext i1 undef to i32 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i1> undef to <4 
x i32> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32i32 = sext <32 x i1> undef to <32 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64i32 = sext <64 x i1> undef to <64 x i32> -- cgit v1.2.3 From 5ad320abe36357e3290007d3ab353e8637f33720 Mon Sep 17 00:00:00 2001 From: Victor Campos Date: Thu, 4 Apr 2024 12:44:32 +0100 Subject: [ARM][Thumb2] Mark BTI-clearing instructions as scheduling region boundaries (#79173) Following https://github.com/llvm/llvm-project/pull/68313 this patch extends the idea to M-profile PACBTI. The Machine Scheduler can reorder instructions within a scheduling region depending on the scheduling policy set. If a BTI-clearing instruction happens to partake in one such region, it might be moved around, therefore ending up where it shouldn't. The solution is to mark all BTI-clearing instructions as scheduling region boundaries. This essentially means that they must not be part of any scheduling region, and as a consequence never get moved: - PAC - PACBTI - BTI - SG Note that PAC isn't BTI-clearing, but it's replaced by PACBTI late in the compilation pipeline. As far as I know, currently it isn't possible to organically obtain code that's susceptible to the bug: - Instructions that write to SP are region boundaries. PAC seems to always be followed by the pushing of r12 to the stack, so essentially PAC is always by itself in a scheduling region. - CALL_BTI is expanded into a machine instruction bundle. Bundles are unpacked only after the last machine scheduler run. 
Thus setjmp and BTI can be separated only if someone deliberately runs the scheduler once more. - The BTI insertion pass is run late in the pipeline, only after the last machine scheduling has run. So once again it can be reordered only if someone deliberately runs the scheduler again. Nevertheless, one can reasonably argue that we should prevent the bug in spite of the compiler not being able to produce the required conditions for it. If things change, the compiler will be robust against this issue. The tests written for this are contrived: bogus MIR instructions have been added adjacent to the BTI-clearing instructions in order to have them inside non-trivial scheduling regions. --- llvm/lib/Target/ARM/Thumb2InstrInfo.cpp | 19 +++ llvm/lib/Target/ARM/Thumb2InstrInfo.h | 4 + llvm/test/CodeGen/ARM/misched-branch-targets.mir | 166 +++++++++++++++++++++++ 3 files changed, 189 insertions(+) create mode 100644 llvm/test/CodeGen/ARM/misched-branch-targets.mir diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index 083f25f49dec..fc2834cb0b45 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -286,6 +286,25 @@ MachineInstr *Thumb2InstrInfo::commuteInstructionImpl(MachineInstr &MI, return ARMBaseInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } +bool Thumb2InstrInfo::isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + // BTI clearing instructions shall not take part in scheduling regions as + // they must stay in their intended place. Although PAC isn't BTI clearing, + // it can be transformed into PACBTI after the pre-RA Machine Scheduling + // has taken place, so its movement must also be restricted. 
+ switch (MI.getOpcode()) { + case ARM::t2BTI: + case ARM::t2PAC: + case ARM::t2PACBTI: + case ARM::t2SG: + return true; + default: + break; + } + return ARMBaseInstrInfo::isSchedulingBoundary(MI, MBB, MF); +} + void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, const DebugLoc &dl, Register DestReg, diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 4bb412f09dcb..8915da8c5bf3 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -68,6 +68,10 @@ public: unsigned OpIdx1, unsigned OpIdx2) const override; + bool isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const override; + private: void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override; }; diff --git a/llvm/test/CodeGen/ARM/misched-branch-targets.mir b/llvm/test/CodeGen/ARM/misched-branch-targets.mir new file mode 100644 index 000000000000..b071fbd4538a --- /dev/null +++ b/llvm/test/CodeGen/ARM/misched-branch-targets.mir @@ -0,0 +1,166 @@ +# RUN: llc -o - -run-pass=machine-scheduler -misched=shuffle %s | FileCheck %s +# RUN: llc -o - -run-pass=postmisched %s | FileCheck %s + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-arm-none-eabi" + + define i32 @foo_bti() #0 { + entry: + ret i32 0 + } + + define i32 @foo_pac() #0 { + entry: + ret i32 0 + } + + define i32 @foo_pacbti() #0 { + entry: + ret i32 0 + } + + define i32 @foo_setjmp() #0 { + entry: + ret i32 0 + if.then: + ret i32 0 + } + + define i32 @foo_sg() #0 { + entry: + ret i32 0 + } + + declare i32 @setjmp(ptr noundef) #1 + declare void @longjmp(ptr noundef, i32 noundef) #2 + + attributes #0 = { "frame-pointer"="all" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main" } + attributes #1 = { nounwind returns_twice "frame-pointer"="all" "target-cpu"="cortex-m55" 
"target-features"="+armv8.1-m.main" } + attributes #2 = { noreturn nounwind "frame-pointer"="all" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main" } + +... +--- +name: foo_bti +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r0 + + t2BTI + renamable $r0, dead $cpsr = nsw tADDi8 killed renamable $r0, 1, 14 /* CC::al */, $noreg + tBX_RET 14 /* CC::al */, $noreg, implicit killed $r0 + +... + +# CHECK-LABEL: name: foo_bti +# CHECK: body: +# CHECK-NEXT: bb.0.entry: +# CHECK-NEXT: liveins: $r0 +# CHECK-NEXT: {{^ +$}} +# CHECK-NEXT: t2BTI + +--- +name: foo_pac +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r0, $lr, $r12 + + frame-setup t2PAC implicit-def $r12, implicit $lr, implicit $sp + renamable $r2 = nsw t2ADDri $r0, 3, 14 /* CC::al */, $noreg, $noreg + $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr + $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg + early-clobber $sp = frame-setup t2STR_PRE killed $r12, $sp, -4, 14 /* CC::al */, $noreg + $r12, $sp = frame-destroy t2LDR_POST $sp, 4, 14 /* CC::al */, $noreg + $sp = frame-destroy t2LDMIA_UPD $sp, 14 /* CC::al */, $noreg, def $r7, def $lr + t2AUT implicit $r12, implicit $lr, implicit $sp + tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + +... 
+ +# CHECK-LABEL: name: foo_pac +# CHECK: body: +# CHECK-NEXT: bb.0.entry: +# CHECK-NEXT: liveins: $r0, $lr, $r12 +# CHECK-NEXT: {{^ +$}} +# CHECK-NEXT: frame-setup t2PAC implicit-def $r12, implicit $lr, implicit $sp + +--- +name: foo_pacbti +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r0, $lr, $r12 + + frame-setup t2PACBTI implicit-def $r12, implicit $lr, implicit $sp + renamable $r2 = nsw t2ADDri $r0, 3, 14 /* CC::al */, $noreg, $noreg + $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr + $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg + early-clobber $sp = frame-setup t2STR_PRE killed $r12, $sp, -4, 14 /* CC::al */, $noreg + $r12, $sp = frame-destroy t2LDR_POST $sp, 4, 14 /* CC::al */, $noreg + $sp = frame-destroy t2LDMIA_UPD $sp, 14 /* CC::al */, $noreg, def $r7, def $lr + t2AUT implicit $r12, implicit $lr, implicit $sp + tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + +... + +# CHECK-LABEL: name: foo_pacbti +# CHECK: body: +# CHECK-NEXT: bb.0.entry: +# CHECK-NEXT: liveins: $r0, $lr, $r12 +# CHECK-NEXT: {{^ +$}} +# CHECK-NEXT: frame-setup t2PACBTI implicit-def $r12, implicit $lr, implicit $sp + +--- +name: foo_setjmp +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1 + liveins: $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + $sp = frame-setup tSUBspi $sp, 40, 14 /* CC::al */, $noreg + renamable $r0 = tMOVr $sp, 14 /* CC::al */, $noreg + tBL 14 /* CC::al */, $noreg, @setjmp, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $r0, implicit-def $sp, implicit-def $r0 + t2BTI + renamable $r2 = nsw t2ADDri $r0, 3, 14 /* CC::al */, $noreg, $noreg + tCMPi8 killed renamable $r0, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 0, 2, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit $itstate + $sp = frame-destroy tADDspi $sp, 40, 0 
/* CC::eq */, $cpsr, implicit $itstate + frame-destroy tPOP_RET 0 /* CC::eq */, killed $cpsr, def $r7, def $pc, implicit killed $r0, implicit $sp, implicit killed $itstate + + bb.1.if.then: + renamable $r0 = tMOVr $sp, 14 /* CC::al */, $noreg + renamable $r1, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + tBL 14 /* CC::al */, $noreg, @longjmp, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $r0, implicit killed $r1, implicit-def $sp + +... + +# CHECK-LABEL: name: foo_setjmp +# CHECK: body: +# CHECK: tBL 14 /* CC::al */, $noreg, @setjmp, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $r0, implicit-def $sp, implicit-def $r0 +# CHECK-NEXT: t2BTI + +--- +name: foo_sg +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r0 + + t2SG 14 /* CC::al */, $noreg + renamable $r0, dead $cpsr = nsw tADDi8 killed renamable $r0, 1, 14 /* CC::al */, $noreg + tBX_RET 14 /* CC::al */, $noreg, implicit killed $r0 + +... + +# CHECK-LABEL: name: foo_sg +# CHECK: body: +# CHECK-NEXT: bb.0.entry: +# CHECK-NEXT: liveins: $r0 +# CHECK-NEXT: {{^ +$}} +# CHECK-NEXT: t2SG -- cgit v1.2.3 From a8c59750d911eb30d5664696db19af445dd770f8 Mon Sep 17 00:00:00 2001 From: OverMighty Date: Thu, 4 Apr 2024 13:22:45 +0100 Subject: [libc][math][c23] Add exp2m1f C23 math function (#86996) Fixes #86502. 
cc @lntue --- libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/math/index.rst | 2 +- libc/spec/stdc.td | 2 + libc/src/math/CMakeLists.txt | 2 + libc/src/math/exp2m1f.h | 18 +++ libc/src/math/generic/CMakeLists.txt | 21 +++ libc/src/math/generic/exp2m1f.cpp | 183 +++++++++++++++++++++++++ libc/test/src/math/CMakeLists.txt | 15 ++ libc/test/src/math/exhaustive/CMakeLists.txt | 15 ++ libc/test/src/math/exhaustive/exp2m1f_test.cpp | 33 +++++ libc/test/src/math/exp2m1f_test.cpp | 66 +++++++++ libc/test/src/math/smoke/CMakeLists.txt | 11 ++ libc/test/src/math/smoke/exp2m1f_test.cpp | 63 +++++++++ libc/utils/MPFRWrapper/MPFRUtils.cpp | 50 ++++++- libc/utils/MPFRWrapper/MPFRUtils.h | 1 + 15 files changed, 478 insertions(+), 5 deletions(-) create mode 100644 libc/src/math/exp2m1f.h create mode 100644 libc/src/math/generic/exp2m1f.cpp create mode 100644 libc/test/src/math/exhaustive/exp2m1f_test.cpp create mode 100644 libc/test/src/math/exp2m1f_test.cpp create mode 100644 libc/test/src/math/smoke/exp2m1f_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index cc7671cab081..2742c33ae478 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -370,6 +370,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.exp10f libc.src.math.exp2 libc.src.math.exp2f + libc.src.math.exp2m1f libc.src.math.expm1 libc.src.math.expm1f libc.src.math.fabs diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 265261be31a2..970a43ca87c9 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -270,7 +270,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | exp2 | |check| | |check| | | | | 7.12.6.4 | F.10.3.4 | 
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| exp2m1 | | | | | | 7.12.6.5 | F.10.3.5 | +| exp2m1 | |check| | | | | | 7.12.6.5 | F.10.3.5 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | expm1 | |check| | |check| | | | | 7.12.6.6 | F.10.3.6 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 719bb9aa18cb..bd62870b07c8 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -535,6 +535,8 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"exp2", RetValSpec, [ArgSpec]>, FunctionSpec<"exp2f", RetValSpec, [ArgSpec]>, + FunctionSpec<"exp2m1f", RetValSpec, [ArgSpec]>, + FunctionSpec<"expm1", RetValSpec, [ArgSpec]>, FunctionSpec<"expm1f", RetValSpec, [ArgSpec]>, diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index b67ee3a8e0df..c89792b8ac7b 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -88,6 +88,8 @@ add_math_entrypoint_object(expf) add_math_entrypoint_object(exp2) add_math_entrypoint_object(exp2f) +add_math_entrypoint_object(exp2m1f) + add_math_entrypoint_object(exp10) add_math_entrypoint_object(exp10f) diff --git a/libc/src/math/exp2m1f.h b/libc/src/math/exp2m1f.h new file mode 100644 index 000000000000..0eaf6b00e958 --- /dev/null +++ b/libc/src/math/exp2m1f.h @@ -0,0 +1,18 @@ +//===-- Implementation header for exp2m1f -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_EXP2M1F_H +#define LLVM_LIBC_SRC_MATH_EXP2M1F_H + +namespace LIBC_NAMESPACE { + +float exp2m1f(float x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_EXP2M1F_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index b164d33e204b..dc77f8b5ddba 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -837,6 +837,27 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + exp2m1f + SRCS + exp2m1f.cpp + HDRS + ../exp2m1f.h + DEPENDS + .explogxf + libc.src.errno.errno + libc.src.__support.common + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.cpu_features + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( exp10 SRCS diff --git a/libc/src/math/generic/exp2m1f.cpp b/libc/src/math/generic/exp2m1f.cpp new file mode 100644 index 000000000000..c60930dd782e --- /dev/null +++ b/libc/src/math/generic/exp2m1f.cpp @@ -0,0 +1,183 @@ +//===-- Implementation of exp2m1f function --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/exp2m1f.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/common.h" +#include "src/__support/macros/optimization.h" +#include "src/__support/macros/properties/cpu_features.h" +#include "src/errno/libc_errno.h" + +#include "explogxf.h" + +namespace LIBC_NAMESPACE { + +static constexpr size_t N_EXCEPTS_LO = 8; + +static constexpr fputil::ExceptValues EXP2M1F_EXCEPTS_LO = + {{ + // (input, RZ output, RU offset, RD offset, RN offset) + // x = 0x1.36dc8ep-36, exp2m1f(x) = 0x1.aef212p-37 (RZ) + {0x2d9b'6e47U, 0x2d57'7909U, 1U, 0U, 0U}, + // x = 0x1.224936p-19, exp2m1f(x) = 0x1.926c0ep-20 (RZ) + {0x3611'249bU, 0x35c9'3607U, 1U, 0U, 1U}, + // x = 0x1.d16d2p-20, exp2m1f(x) = 0x1.429becp-20 (RZ) + {0x35e8'b690U, 0x35a1'4df6U, 1U, 0U, 1U}, + // x = 0x1.17949ep-14, exp2m1f(x) = 0x1.8397p-15 (RZ) + {0x388b'ca4fU, 0x3841'cb80U, 1U, 0U, 1U}, + // x = -0x1.9c3e1ep-38, exp2m1f(x) = -0x1.1dbeacp-38 (RZ) + {0xacce'1f0fU, 0xac8e'df56U, 0U, 1U, 0U}, + // x = -0x1.4d89b4p-32, exp2m1f(x) = -0x1.ce61b6p-33 (RZ) + {0xafa6'c4daU, 0xaf67'30dbU, 0U, 1U, 1U}, + // x = -0x1.a6eac4p-10, exp2m1f(x) = -0x1.24fadap-10 (RZ) + {0xbad3'7562U, 0xba92'7d6dU, 0U, 1U, 1U}, + // x = -0x1.e7526ep-6, exp2m1f(x) = -0x1.4e53dep-6 (RZ) + {0xbcf3'a937U, 0xbca7'29efU, 0U, 1U, 1U}, + }}; + +static constexpr size_t N_EXCEPTS_HI = 3; + +static constexpr fputil::ExceptValues EXP2M1F_EXCEPTS_HI = + {{ + // (input, RZ output, RU offset, RD offset, RN offset) + // x = 0x1.16a972p-1, exp2m1f(x) = 0x1.d545b2p-2 (RZ) + {0x3f0b'54b9U, 0x3eea'a2d9U, 1U, 0U, 0U}, + // x = -0x1.9f12acp-5, exp2m1f(x) = 
-0x1.1ab68cp-5 (RZ) + {0xbd4f'8956U, 0xbd0d'5b46U, 0U, 1U, 0U}, + // x = -0x1.de7b9cp-5, exp2m1f(x) = -0x1.4508f4p-5 (RZ) + {0xbd6f'3dceU, 0xbd22'847aU, 0U, 1U, 1U}, + }}; + +LLVM_LIBC_FUNCTION(float, exp2m1f, (float x)) { + using FPBits = fputil::FPBits; + FPBits xbits(x); + + uint32_t x_u = xbits.uintval(); + uint32_t x_abs = x_u & 0x7fff'ffffU; + + // When |x| >= 128, or x is nan, or |x| <= 2^-5 + if (LIBC_UNLIKELY(x_abs >= 0x4300'0000U || x_abs <= 0x3d00'0000U)) { + // |x| <= 2^-5 + if (x_abs <= 0x3d00'0000U) { + if (auto r = EXP2M1F_EXCEPTS_LO.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); + + // Minimax polynomial generated by Sollya with: + // > display = hexadecimal; + // > fpminimax((2^x - 1)/x, 5, [|D...|], [-2^-5, 2^-5]); + constexpr double COEFFS[] = { + 0x1.62e42fefa39f3p-1, 0x1.ebfbdff82c57bp-3, 0x1.c6b08d6f2d7aap-5, + 0x1.3b2ab6fc92f5dp-7, 0x1.5d897cfe27125p-10, 0x1.43090e61e6af1p-13}; + double xd = x; + double xsq = xd * xd; + double c0 = fputil::multiply_add(xd, COEFFS[1], COEFFS[0]); + double c1 = fputil::multiply_add(xd, COEFFS[3], COEFFS[2]); + double c2 = fputil::multiply_add(xd, COEFFS[5], COEFFS[4]); + double p = fputil::polyeval(xsq, c0, c1, c2); + return static_cast(p * xd); + } + + // x >= 128, or x is nan + if (xbits.is_pos()) { + if (xbits.is_finite()) { + int rounding = fputil::quick_get_round(); + if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) + return FPBits::max_normal().get_val(); + + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW); + } + + // x >= 128 and 2^x - 1 rounds to +inf, or x is +inf or nan + return x + FPBits::inf().get_val(); + } + } + + if (LIBC_UNLIKELY(x <= -25.0f)) { + // 2^(-inf) - 1 = -1 + if (xbits.is_inf()) + return -1.0f; + // 2^nan - 1 = nan + if (xbits.is_nan()) + return x; + + int rounding = fputil::quick_get_round(); + if (rounding == FE_UPWARD || rounding == FE_TOWARDZERO) + return -0x1.ffff'fep-1f; // -1.0f + 0x1.0p-24f + + 
fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_UNDERFLOW); + return -1.0f; + } + + if (auto r = EXP2M1F_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); + + // For -25 < x < 128, to compute 2^x, we perform the following range + // reduction: find hi, mid, lo such that: + // x = hi + mid + lo, in which: + // hi is an integer, + // 0 <= mid * 2^5 < 32 is an integer, + // -2^(-6) <= lo <= 2^(-6). + // In particular, + // hi + mid = round(x * 2^5) * 2^(-5). + // Then, + // 2^x = 2^(hi + mid + lo) = 2^hi * 2^mid * 2^lo. + // 2^mid is stored in the lookup table of 32 elements. + // 2^lo is computed using a degree-4 minimax polynomial generated by Sollya. + // We perform 2^hi * 2^mid by simply add hi to the exponent field of 2^mid. + + // kf = (hi + mid) * 2^5 = round(x * 2^5) + float kf; + int k; +#ifdef LIBC_TARGET_CPU_HAS_NEAREST_INT + kf = fputil::nearest_integer(x * 32.0f); + k = static_cast(kf); +#else + constexpr float HALF[2] = {0.5f, -0.5f}; + k = static_cast(fputil::multiply_add(x, 32.0f, HALF[x < 0.0f])); + kf = static_cast(k); +#endif // LIBC_TARGET_CPU_HAS_NEAREST_INT + + // lo = x - (hi + mid) = x - kf * 2^(-5) + double lo = fputil::multiply_add(-0x1.0p-5f, kf, x); + + // hi = floor(kf * 2^(-4)) + // exp2_hi = shift hi to the exponent field of double precision. 
+ int64_t exp2_hi = + static_cast(static_cast(k >> ExpBase::MID_BITS) + << fputil::FPBits::FRACTION_LEN); + // mh = 2^hi * 2^mid + // mh_bits = bit field of mh + int64_t mh_bits = ExpBase::EXP_2_MID[k & ExpBase::MID_MASK] + exp2_hi; + double mh = fputil::FPBits(static_cast(mh_bits)).get_val(); + + // Degree-4 polynomial approximating (2^x - 1)/x generated by Sollya with: + // > display = hexadecimal; + // > fpminimax((2^x - 1)/x, 4, [|D...|], [-2^-6, 2^-6]); + constexpr double COEFFS[5] = {0x1.62e42fefa39efp-1, 0x1.ebfbdff8131c4p-3, + 0x1.c6b08d7061695p-5, 0x1.3b2b1bee74b2ap-7, + 0x1.5d88091198529p-10}; + double lo_sq = lo * lo; + double c1 = fputil::multiply_add(lo, COEFFS[0], 1.0); + double c2 = fputil::multiply_add(lo, COEFFS[2], COEFFS[1]); + double c3 = fputil::multiply_add(lo, COEFFS[4], COEFFS[3]); + double exp2_lo = fputil::polyeval(lo_sq, c1, c2, c3); + // 2^x - 1 = 2^(hi + mid + lo) - 1 + // = 2^(hi + mid) * 2^lo - 1 + // ~ mh * (1 + lo * P(lo)) - 1 + // = mh * exp2_lo - 1 + return static_cast(fputil::multiply_add(exp2_lo, mh, -1.0)); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index f8f0f8ba7b6f..bbf8f071e1e0 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -637,6 +637,21 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + exp2m1f_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + exp2m1f_test.cpp + DEPENDS + libc.include.llvm-libc-macros.math_macros + libc.src.errno.errno + libc.src.math.exp2m1f + libc.src.__support.CPP.array + libc.src.__support.FPUtil.fp_bits +) + add_fp_unittest( exp10f_test NEED_MPFR diff --git a/libc/test/src/math/exhaustive/CMakeLists.txt b/libc/test/src/math/exhaustive/CMakeLists.txt index df32dd4f943f..6b2f3dddcadd 100644 --- a/libc/test/src/math/exhaustive/CMakeLists.txt +++ b/libc/test/src/math/exhaustive/CMakeLists.txt @@ -142,6 +142,21 @@ add_fp_unittest( -lpthread ) 
+add_fp_unittest( + exp2m1f_test + NO_RUN_POSTBUILD + NEED_MPFR + SUITE + libc_math_exhaustive_tests + SRCS + exp2m1f_test.cpp + DEPENDS + .exhaustive_test + libc.src.math.exp2m1f + LINK_LIBRARIES + -lpthread +) + add_fp_unittest( exp10f_test NO_RUN_POSTBUILD diff --git a/libc/test/src/math/exhaustive/exp2m1f_test.cpp b/libc/test/src/math/exhaustive/exp2m1f_test.cpp new file mode 100644 index 000000000000..2111024cb5c0 --- /dev/null +++ b/libc/test/src/math/exhaustive/exp2m1f_test.cpp @@ -0,0 +1,33 @@ +//===-- Exhaustive test for exp2m1f ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "exhaustive_test.h" +#include "src/math/exp2m1f.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +using LlvmLibcExp2m1fExhaustiveTest = + LlvmLibcUnaryOpExhaustiveMathTest; + +// Range: [0, Inf]; +static constexpr uint32_t POS_START = 0x0000'0000U; +static constexpr uint32_t POS_STOP = 0x7f80'0000U; + +TEST_F(LlvmLibcExp2m1fExhaustiveTest, PostiveRange) { + test_full_range_all_roundings(POS_START, POS_STOP); +} + +// Range: [-Inf, 0]; +static constexpr uint32_t NEG_START = 0x8000'0000U; +static constexpr uint32_t NEG_STOP = 0xff80'0000U; + +TEST_F(LlvmLibcExp2m1fExhaustiveTest, NegativeRange) { + test_full_range_all_roundings(NEG_START, NEG_STOP); +} diff --git a/libc/test/src/math/exp2m1f_test.cpp b/libc/test/src/math/exp2m1f_test.cpp new file mode 100644 index 000000000000..a0f0da868117 --- /dev/null +++ b/libc/test/src/math/exp2m1f_test.cpp @@ -0,0 +1,66 @@ +//===-- Unittests for exp2m1f ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "include/llvm-libc-macros/math-macros.h" +#include "src/__support/CPP/array.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/errno/libc_errno.h" +#include "src/math/exp2m1f.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +#include + +using LlvmLibcExp2m1fTest = LIBC_NAMESPACE::testing::FPTest; + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +TEST_F(LlvmLibcExp2m1fTest, TrickyInputs) { + constexpr LIBC_NAMESPACE::cpp::array INPUTS = { + // EXP2M1F_EXCEPTS_LO + 0x1.36dc8ep-36, + 0x1.224936p-19, + 0x1.d16d2p-20, + 0x1.17949ep-14, + -0x1.9c3e1ep-38, + -0x1.4d89b4p-32, + -0x1.a6eac4p-10, + -0x1.e7526ep-6, + // EXP2M1F_EXCEPTS_HI + 0x1.16a972p-1, + -0x1.9f12acp-5, + }; + + for (float x : INPUTS) { + LIBC_NAMESPACE::libc_errno = 0; + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x, + LIBC_NAMESPACE::exp2m1f(x), 0.5); + } +} + +TEST_F(LlvmLibcExp2m1fTest, InFloatRange) { + constexpr uint32_t COUNT = 100'000; + constexpr uint32_t STEP = UINT32_MAX / COUNT; + for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { + float x = FPBits(v).get_val(); + if (isnan(x) || isinf(x)) + continue; + LIBC_NAMESPACE::libc_errno = 0; + float result = LIBC_NAMESPACE::exp2m1f(x); + + // If the computation resulted in an error or did not produce valid result + // in the single-precision floating point range, then ignore comparing with + // MPFR result as MPFR can still produce valid results because of its + // wider precision. 
+ if (isnan(result) || isinf(result) || LIBC_NAMESPACE::libc_errno != 0) + continue; + ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x, + LIBC_NAMESPACE::exp2m1f(x), 0.5); + } +} diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 5d269ddb229c..4ac1842cf5fa 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -773,6 +773,17 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + exp2m1f_test + SUITE + libc-math-smoke-tests + SRCS + exp2m1f_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.exp2m1f +) + add_fp_unittest( exp10f_test SUITE diff --git a/libc/test/src/math/smoke/exp2m1f_test.cpp b/libc/test/src/math/smoke/exp2m1f_test.cpp new file mode 100644 index 000000000000..2df435385247 --- /dev/null +++ b/libc/test/src/math/smoke/exp2m1f_test.cpp @@ -0,0 +1,63 @@ +//===-- Unittests for exp2m1f ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/errno/libc_errno.h" +#include "src/math/exp2m1f.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcExp2m1fTest = LIBC_NAMESPACE::testing::FPTest; +using LIBC_NAMESPACE::fputil::testing::ForceRoundingMode; +using LIBC_NAMESPACE::fputil::testing::RoundingMode; + +TEST_F(LlvmLibcExp2m1fTest, SpecialNumbers) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp2m1f(aNaN)); + EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::exp2m1f(inf)); + EXPECT_FP_EQ_ALL_ROUNDING(-1.0f, LIBC_NAMESPACE::exp2m1f(neg_inf)); + EXPECT_FP_EQ_ALL_ROUNDING(0.0f, LIBC_NAMESPACE::exp2m1f(0.0f)); + EXPECT_FP_EQ_ALL_ROUNDING(-0.0f, LIBC_NAMESPACE::exp2m1f(-0.0f)); + + EXPECT_FP_EQ_ALL_ROUNDING(1.0f, LIBC_NAMESPACE::exp2m1f(1.0f)); + EXPECT_FP_EQ_ALL_ROUNDING(-0.5f, LIBC_NAMESPACE::exp2m1f(-1.0f)); + EXPECT_FP_EQ_ALL_ROUNDING(3.0f, LIBC_NAMESPACE::exp2m1f(2.0f)); + EXPECT_FP_EQ_ALL_ROUNDING(-0.75f, LIBC_NAMESPACE::exp2m1f(-2.0f)); +} + +TEST_F(LlvmLibcExp2m1fTest, Overflow) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2m1f(0x1.fffffep+127), + FE_OVERFLOW); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2m1f(128.0f), + FE_OVERFLOW); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2m1f(0x1.000002p+7), + FE_OVERFLOW); + EXPECT_MATH_ERRNO(ERANGE); +} + +TEST_F(LlvmLibcExp2m1fTest, Underflow) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ_WITH_EXCEPTION(-1.0f, LIBC_NAMESPACE::exp2m1f(-0x1.fffffep+127), + FE_UNDERFLOW); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION(-1.0f, LIBC_NAMESPACE::exp2m1f(-25.0f), + FE_UNDERFLOW); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION(-1.0f, LIBC_NAMESPACE::exp2m1f(-0x1.900002p4), 
+ FE_UNDERFLOW); + EXPECT_MATH_ERRNO(ERANGE); +} diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index a83f7a7ceb92..eaa47da6bda2 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -89,7 +89,8 @@ public: // precision. template , int> = 0> - explicit MPFRNumber(XType x, int precision = ExtraPrecision::VALUE, + explicit MPFRNumber(XType x, + unsigned int precision = ExtraPrecision::VALUE, RoundingMode rounding = RoundingMode::Nearest) : mpfr_precision(precision), mpfr_rounding(get_mpfr_rounding_mode(rounding)) { @@ -99,7 +100,8 @@ public: template , int> = 0> - explicit MPFRNumber(XType x, int precision = ExtraPrecision::VALUE, + explicit MPFRNumber(XType x, + unsigned int precision = ExtraPrecision::VALUE, RoundingMode rounding = RoundingMode::Nearest) : mpfr_precision(precision), mpfr_rounding(get_mpfr_rounding_mode(rounding)) { @@ -109,7 +111,8 @@ public: template , int> = 0> - explicit MPFRNumber(XType x, int precision = ExtraPrecision::VALUE, + explicit MPFRNumber(XType x, + unsigned int precision = ExtraPrecision::VALUE, RoundingMode rounding = RoundingMode::Nearest) : mpfr_precision(precision), mpfr_rounding(get_mpfr_rounding_mode(rounding)) { @@ -119,7 +122,8 @@ public: template , int> = 0> - explicit MPFRNumber(XType x, int precision = ExtraPrecision::VALUE, + explicit MPFRNumber(XType x, + unsigned int precision = ExtraPrecision::VALUE, RoundingMode rounding = RoundingMode::Nearest) : mpfr_precision(precision), mpfr_rounding(get_mpfr_rounding_mode(rounding)) { @@ -134,6 +138,12 @@ public: mpfr_set(value, other.value, mpfr_rounding); } + MPFRNumber(const MPFRNumber &other, unsigned int precision) + : mpfr_precision(precision), mpfr_rounding(other.mpfr_rounding) { + mpfr_init2(value, mpfr_precision); + mpfr_set(value, other.value, mpfr_rounding); + } + ~MPFRNumber() { mpfr_clear(value); } MPFRNumber &operator=(const MPFRNumber &rhs) { @@ -229,6 +239,36 @@ public: return 
result; } + MPFRNumber exp2m1() const { + // TODO: Only use mpfr_exp2m1 once CI and buildbots get MPFR >= 4.2.0. +#if MPFR_VERSION_MAJOR > 4 || \ + (MPFR_VERSION_MAJOR == 4 && MPFR_VERSION_MINOR >= 2) + MPFRNumber result(*this); + mpfr_exp2m1(result.value, value, mpfr_rounding); + return result; +#else + unsigned int prec = mpfr_precision * 3; + MPFRNumber result(*this, prec); + + float f = mpfr_get_flt(abs().value, mpfr_rounding); + if (f > 0.5f && f < 0x1.0p30f) { + mpfr_exp2(result.value, value, mpfr_rounding); + mpfr_sub_ui(result.value, result.value, 1, mpfr_rounding); + return result; + } + + MPFRNumber ln2(2.0f, prec); + // log(2) + mpfr_log(ln2.value, ln2.value, mpfr_rounding); + // x * log(2) + mpfr_mul(result.value, value, ln2.value, mpfr_rounding); + // e^(x * log(2)) - 1 + int ex = mpfr_expm1(result.value, result.value, mpfr_rounding); + mpfr_subnormalize(result.value, ex, mpfr_rounding); + return result; +#endif + } + MPFRNumber exp10() const { MPFRNumber result(*this); mpfr_exp10(result.value, value, mpfr_rounding); @@ -570,6 +610,8 @@ unary_operation(Operation op, InputType input, unsigned int precision, return mpfrInput.exp(); case Operation::Exp2: return mpfrInput.exp2(); + case Operation::Exp2m1: + return mpfrInput.exp2m1(); case Operation::Exp10: return mpfrInput.exp10(); case Operation::Expm1: diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h index d5ff590cd7bb..0a41ac639798 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.h +++ b/libc/utils/MPFRWrapper/MPFRUtils.h @@ -37,6 +37,7 @@ enum class Operation : int { Erf, Exp, Exp2, + Exp2m1, Exp10, Expm1, Floor, -- cgit v1.2.3 From ff56584ee9c8a6d5430c0ba461540ccb6696ebc6 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Thu, 4 Apr 2024 14:25:54 +0200 Subject: [LLD][COFF] Use getMachineArchType in LinkerDriver::getArch. (#87499) Adds support for ARM64EC, which should use the same search paths as ARM64. It's similar to #87370 and #87495. 
The test is based on the existing x86 test. Generally ARM64EC libraries are shipped together with native ARM64 libraries (using ECSYMBOLS section mechanism). getMachineArchType uses Triple::thumb, while the existing implementation uses Triple::arm. It's ultimately passed to MSVCPaths.cpp functions, so modify them to accept both forms. --- lld/COFF/Driver.cpp | 13 +------------ lld/test/COFF/print-search-paths-arm64.s | 24 ++++++++++++++++++++++++ llvm/lib/WindowsDriver/MSVCPaths.cpp | 4 ++++ 3 files changed, 29 insertions(+), 12 deletions(-) create mode 100644 lld/test/COFF/print-search-paths-arm64.s diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 2b1d4abb6ed0..ea37f8deecf0 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -157,18 +157,7 @@ StringRef LinkerDriver::mangle(StringRef sym) { } llvm::Triple::ArchType LinkerDriver::getArch() { - switch (ctx.config.machine) { - case I386: - return llvm::Triple::ArchType::x86; - case AMD64: - return llvm::Triple::ArchType::x86_64; - case ARMNT: - return llvm::Triple::ArchType::arm; - case ARM64: - return llvm::Triple::ArchType::aarch64; - default: - return llvm::Triple::ArchType::UnknownArch; - } + return getMachineArchType(ctx.config.machine); } bool LinkerDriver::findUnderscoreMangle(StringRef sym) { diff --git a/lld/test/COFF/print-search-paths-arm64.s b/lld/test/COFF/print-search-paths-arm64.s new file mode 100644 index 000000000000..fb5c8897a165 --- /dev/null +++ b/lld/test/COFF/print-search-paths-arm64.s @@ -0,0 +1,24 @@ +# REQUIRES: aarch64 + +# RUN: llvm-mc -triple aarch64-windows-msvc %s -filetype=obj -o %t.aarch64.obj +# RUN: lld-link -dll -noentry -winsysroot:%t.dir/sysroot -vctoolsversion:1.1.1.1 -winsdkversion:10.0.1 -libpath:custom-dir \ +# RUN: %t.aarch64.obj -print-search-paths | FileCheck -DSYSROOT=%t.dir %s + +# RUN: llvm-mc -triple arm64ec-windows-msvc %s -filetype=obj -o %t.arm64ec.obj +# RUN: lld-link -dll -noentry -winsysroot:%t.dir/sysroot -vctoolsversion:1.1.1.1 
-winsdkversion:10.0.1 -libpath:custom-dir \ +# RUN: %t.arm64ec.obj -print-search-paths -machine:arm64ec | FileCheck -DSYSROOT=%t.dir %s + +# CHECK: Library search paths: +# CHECK-NEXT: (cwd) +# CHECK-NEXT: custom-dir +# CHECK-NEXT: [[CPATH:.*]]lib{{[/\\]}}clang{{[/\\]}}{{[0-9]+}}{{[/\\]}}lib{{[/\\]}}windows +# CHECK-NEXT: [[CPATH]]lib{{[/\\]}}clang{{[/\\]}}{{[0-9]+}}{{[/\\]}}lib +# CHECK-NEXT: [[CPATH]]lib +# CHECK-NEXT: [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}DIA SDK{{[/\\]}}lib{{[/\\]}}arm64 +# CHECK-NEXT: [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}VC{{[/\\]}}Tools{{[/\\]}}MSVC{{[/\\]}}1.1.1.1{{[/\\]}}lib{{[/\\]}}arm64 +# CHECK-NEXT: [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}VC{{[/\\]}}Tools{{[/\\]}}MSVC{{[/\\]}}1.1.1.1{{[/\\]}}atlmfc{{[/\\]}}lib{{[/\\]}}arm64 +# CHECK-NEXT: [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}Windows Kits{{[/\\]}}10{{[/\\]}}Lib{{[/\\]}}10.0.1{{[/\\]}}ucrt{{[/\\]}}arm64 +# CHECK-NEXT: [[SYSROOT]]{{[/\\]}}sysroot{{[/\\]}}Windows Kits{{[/\\]}}10{{[/\\]}}Lib{{[/\\]}}10.0.1{{[/\\]}}um{{[/\\]}}arm64 + + .data + .word 1 diff --git a/llvm/lib/WindowsDriver/MSVCPaths.cpp b/llvm/lib/WindowsDriver/MSVCPaths.cpp index 634cfcb15f1d..a7bffbb20eba 100644 --- a/llvm/lib/WindowsDriver/MSVCPaths.cpp +++ b/llvm/lib/WindowsDriver/MSVCPaths.cpp @@ -268,6 +268,7 @@ const char *archToWindowsSDKArch(Triple::ArchType Arch) { case Triple::ArchType::x86_64: return "x64"; case Triple::ArchType::arm: + case Triple::ArchType::thumb: return "arm"; case Triple::ArchType::aarch64: return "arm64"; @@ -285,6 +286,7 @@ const char *archToLegacyVCArch(Triple::ArchType Arch) { case Triple::ArchType::x86_64: return "amd64"; case Triple::ArchType::arm: + case Triple::ArchType::thumb: return "arm"; case Triple::ArchType::aarch64: return "arm64"; @@ -300,6 +302,7 @@ const char *archToDevDivInternalArch(Triple::ArchType Arch) { case Triple::ArchType::x86_64: return "amd64"; case Triple::ArchType::arm: + case Triple::ArchType::thumb: return "arm"; case Triple::ArchType::aarch64: return "arm64"; @@ -321,6 
+324,7 @@ bool appendArchToWindowsSDKLibPath(int SDKMajor, SmallString<128> LibPath, sys::path::append(LibPath, "x64"); break; case Triple::arm: + case Triple::thumb: // It is not necessary to link against Windows SDK 7.x when targeting ARM. return false; default: -- cgit v1.2.3 From 22089ae6c591d11143724b4bde418aa067958a8f Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 4 Apr 2024 14:39:02 +0200 Subject: Revert "[flang][runtime] Enable I/O APIs in F18 runtime offload builds." (#87629) Reverts llvm/llvm-project#87543 The pre-merge Windows build is broken. --- flang/include/flang/Runtime/io-api.h | 164 ++++++++++++++-------------- flang/runtime/environment.cpp | 2 - flang/runtime/environment.h | 2 +- flang/runtime/freestanding-tools.h | 19 ---- flang/runtime/io-api.cpp | 204 ++++++++++++++++++----------------- flang/runtime/io-error.cpp | 9 +- flang/runtime/io-error.h | 2 +- flang/runtime/namelist.cpp | 46 ++++---- 8 files changed, 213 insertions(+), 235 deletions(-) diff --git a/flang/include/flang/Runtime/io-api.h b/flang/include/flang/Runtime/io-api.h index 328afc715a3f..1b6c4f5d6a65 100644 --- a/flang/include/flang/Runtime/io-api.h +++ b/flang/include/flang/Runtime/io-api.h @@ -92,18 +92,18 @@ constexpr std::size_t RecommendedInternalIoScratchAreaBytes( // Internal I/O to/from character arrays &/or non-default-kind character // requires a descriptor, which is copied. 
-Cookie IODECL(BeginInternalArrayListOutput)(const Descriptor &, +Cookie IONAME(BeginInternalArrayListOutput)(const Descriptor &, void **scratchArea = nullptr, std::size_t scratchBytes = 0, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginInternalArrayListInput)(const Descriptor &, +Cookie IONAME(BeginInternalArrayListInput)(const Descriptor &, void **scratchArea = nullptr, std::size_t scratchBytes = 0, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginInternalArrayFormattedOutput)(const Descriptor &, +Cookie IONAME(BeginInternalArrayFormattedOutput)(const Descriptor &, const char *format, std::size_t formatLength, const Descriptor *formatDescriptor = nullptr, void **scratchArea = nullptr, std::size_t scratchBytes = 0, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginInternalArrayFormattedInput)(const Descriptor &, +Cookie IONAME(BeginInternalArrayFormattedInput)(const Descriptor &, const char *format, std::size_t formatLength, const Descriptor *formatDescriptor = nullptr, void **scratchArea = nullptr, std::size_t scratchBytes = 0, const char *sourceFile = nullptr, @@ -111,20 +111,20 @@ Cookie IODECL(BeginInternalArrayFormattedInput)(const Descriptor &, // Internal I/O to/from a default-kind character scalar can avoid a // descriptor. 
-Cookie IODECL(BeginInternalListOutput)(char *internal, +Cookie IONAME(BeginInternalListOutput)(char *internal, std::size_t internalLength, void **scratchArea = nullptr, std::size_t scratchBytes = 0, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginInternalListInput)(const char *internal, +Cookie IONAME(BeginInternalListInput)(const char *internal, std::size_t internalLength, void **scratchArea = nullptr, std::size_t scratchBytes = 0, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginInternalFormattedOutput)(char *internal, +Cookie IONAME(BeginInternalFormattedOutput)(char *internal, std::size_t internalLength, const char *format, std::size_t formatLength, const Descriptor *formatDescriptor = nullptr, void **scratchArea = nullptr, std::size_t scratchBytes = 0, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginInternalFormattedInput)(const char *internal, +Cookie IONAME(BeginInternalFormattedInput)(const char *internal, std::size_t internalLength, const char *format, std::size_t formatLength, const Descriptor *formatDescriptor = nullptr, void **scratchArea = nullptr, std::size_t scratchBytes = 0, const char *sourceFile = nullptr, @@ -139,63 +139,63 @@ Cookie IODECL(BeginInternalFormattedInput)(const char *internal, // If handleError is false, and the unit number is out of range, the program // will be terminated. Otherwise, if unit is out of range, a nonzero Iostat // code is returned and ioMsg is set if it is not a nullptr. 
-enum Iostat IODECL(CheckUnitNumberInRange64)(std::int64_t unit, +enum Iostat IONAME(CheckUnitNumberInRange64)(std::int64_t unit, bool handleError, char *ioMsg = nullptr, std::size_t ioMsgLength = 0, const char *sourceFile = nullptr, int sourceLine = 0); -enum Iostat IODECL(CheckUnitNumberInRange128)(common::int128_t unit, +enum Iostat IONAME(CheckUnitNumberInRange128)(common::int128_t unit, bool handleError, char *ioMsg = nullptr, std::size_t ioMsgLength = 0, const char *sourceFile = nullptr, int sourceLine = 0); // External synchronous I/O initiation Cookie IODECL(BeginExternalListOutput)(ExternalUnit = DefaultOutputUnit, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginExternalListInput)(ExternalUnit = DefaultInputUnit, +Cookie IONAME(BeginExternalListInput)(ExternalUnit = DefaultInputUnit, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginExternalFormattedOutput)(const char *format, std::size_t, +Cookie IONAME(BeginExternalFormattedOutput)(const char *format, std::size_t, const Descriptor *formatDescriptor = nullptr, ExternalUnit = DefaultOutputUnit, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginExternalFormattedInput)(const char *format, std::size_t, +Cookie IONAME(BeginExternalFormattedInput)(const char *format, std::size_t, const Descriptor *formatDescriptor = nullptr, ExternalUnit = DefaultInputUnit, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginUnformattedOutput)(ExternalUnit = DefaultOutputUnit, +Cookie IONAME(BeginUnformattedOutput)(ExternalUnit = DefaultOutputUnit, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginUnformattedInput)(ExternalUnit = DefaultInputUnit, +Cookie IONAME(BeginUnformattedInput)(ExternalUnit = DefaultInputUnit, const char *sourceFile = nullptr, int sourceLine = 0); // WAIT(ID=) -Cookie IODECL(BeginWait)(ExternalUnit, AsynchronousId, +Cookie IONAME(BeginWait)(ExternalUnit, AsynchronousId, const char 
*sourceFile = nullptr, int sourceLine = 0); // WAIT(no ID=) -Cookie IODECL(BeginWaitAll)( +Cookie IONAME(BeginWaitAll)( ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0); // Other I/O statements -Cookie IODECL(BeginClose)( +Cookie IONAME(BeginClose)( ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginFlush)( +Cookie IONAME(BeginFlush)( ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginBackspace)( +Cookie IONAME(BeginBackspace)( ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginEndfile)( +Cookie IONAME(BeginEndfile)( ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginRewind)( +Cookie IONAME(BeginRewind)( ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0); // OPEN(UNIT=) and OPEN(NEWUNIT=) have distinct interfaces. -Cookie IODECL(BeginOpenUnit)( +Cookie IONAME(BeginOpenUnit)( ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginOpenNewUnit)( +Cookie IONAME(BeginOpenNewUnit)( const char *sourceFile = nullptr, int sourceLine = 0); // The variant forms of INQUIRE() statements have distinct interfaces. // BeginInquireIoLength() is basically a no-op output statement. 
-Cookie IODECL(BeginInquireUnit)( +Cookie IONAME(BeginInquireUnit)( ExternalUnit, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginInquireFile)(const char *, std::size_t, +Cookie IONAME(BeginInquireFile)(const char *, std::size_t, const char *sourceFile = nullptr, int sourceLine = 0); -Cookie IODECL(BeginInquireIoLength)( +Cookie IONAME(BeginInquireIoLength)( const char *sourceFile = nullptr, int sourceLine = 0); // If an I/O statement has any IOSTAT=, ERR=, END=, or EOR= specifiers, @@ -214,33 +214,33 @@ Cookie IODECL(BeginInquireIoLength)( // } // } // if (EndIoStatement(cookie) == FORTRAN_RUTIME_IOSTAT_END) goto label666; -void IODECL(EnableHandlers)(Cookie, bool hasIoStat = false, bool hasErr = false, +void IONAME(EnableHandlers)(Cookie, bool hasIoStat = false, bool hasErr = false, bool hasEnd = false, bool hasEor = false, bool hasIoMsg = false); // ASYNCHRONOUS='YES' or 'NO' on READ/WRITE/OPEN // Use GetAsynchronousId() to handle ID=. -bool IODECL(SetAsynchronous)(Cookie, const char *, std::size_t); +bool IONAME(SetAsynchronous)(Cookie, const char *, std::size_t); // Control list options. These return false on a error that the // Begin...() call has specified will be handled by the caller. // The interfaces that pass a default-kind CHARACTER argument // are limited to passing specific case-insensitive keyword values. 
// ADVANCE=YES, NO -bool IODECL(SetAdvance)(Cookie, const char *, std::size_t); +bool IONAME(SetAdvance)(Cookie, const char *, std::size_t); // BLANK=NULL, ZERO -bool IODECL(SetBlank)(Cookie, const char *, std::size_t); +bool IONAME(SetBlank)(Cookie, const char *, std::size_t); // DECIMAL=COMMA, POINT -bool IODECL(SetDecimal)(Cookie, const char *, std::size_t); +bool IONAME(SetDecimal)(Cookie, const char *, std::size_t); // DELIM=APOSTROPHE, QUOTE, NONE -bool IODECL(SetDelim)(Cookie, const char *, std::size_t); +bool IONAME(SetDelim)(Cookie, const char *, std::size_t); // PAD=YES, NO -bool IODECL(SetPad)(Cookie, const char *, std::size_t); -bool IODECL(SetPos)(Cookie, std::int64_t); -bool IODECL(SetRec)(Cookie, std::int64_t); +bool IONAME(SetPad)(Cookie, const char *, std::size_t); +bool IONAME(SetPos)(Cookie, std::int64_t); +bool IONAME(SetRec)(Cookie, std::int64_t); // ROUND=UP, DOWN, ZERO, NEAREST, COMPATIBLE, PROCESSOR_DEFINED -bool IODECL(SetRound)(Cookie, const char *, std::size_t); +bool IONAME(SetRound)(Cookie, const char *, std::size_t); // SIGN=PLUS, SUPPRESS, PROCESSOR_DEFINED -bool IODECL(SetSign)(Cookie, const char *, std::size_t); +bool IONAME(SetSign)(Cookie, const char *, std::size_t); // Data item transfer for modes other than NAMELIST: // Any data object that can be passed as an actual argument without the @@ -256,34 +256,34 @@ bool IODECL(SetSign)(Cookie, const char *, std::size_t); // Once the statement has encountered an error, all following items will be // ignored and also return false; but compiled code should check for errors // and avoid the following items when they might crash. 
-bool IODECL(OutputDescriptor)(Cookie, const Descriptor &); -bool IODECL(InputDescriptor)(Cookie, const Descriptor &); +bool IONAME(OutputDescriptor)(Cookie, const Descriptor &); +bool IONAME(InputDescriptor)(Cookie, const Descriptor &); // Formatted (including list directed) I/O data items -bool IODECL(OutputInteger8)(Cookie, std::int8_t); -bool IODECL(OutputInteger16)(Cookie, std::int16_t); +bool IONAME(OutputInteger8)(Cookie, std::int8_t); +bool IONAME(OutputInteger16)(Cookie, std::int16_t); bool IODECL(OutputInteger32)(Cookie, std::int32_t); -bool IODECL(OutputInteger64)(Cookie, std::int64_t); -bool IODECL(OutputInteger128)(Cookie, common::int128_t); -bool IODECL(InputInteger)(Cookie, std::int64_t &, int kind = 8); -bool IODECL(OutputReal32)(Cookie, float); -bool IODECL(InputReal32)(Cookie, float &); -bool IODECL(OutputReal64)(Cookie, double); -bool IODECL(InputReal64)(Cookie, double &); -bool IODECL(OutputComplex32)(Cookie, float, float); -bool IODECL(InputComplex32)(Cookie, float[2]); -bool IODECL(OutputComplex64)(Cookie, double, double); -bool IODECL(InputComplex64)(Cookie, double[2]); -bool IODECL(OutputCharacter)(Cookie, const char *, std::size_t, int kind = 1); -bool IODECL(OutputAscii)(Cookie, const char *, std::size_t); -bool IODECL(InputCharacter)(Cookie, char *, std::size_t, int kind = 1); -bool IODECL(InputAscii)(Cookie, char *, std::size_t); -bool IODECL(OutputLogical)(Cookie, bool); -bool IODECL(InputLogical)(Cookie, bool &); +bool IONAME(OutputInteger64)(Cookie, std::int64_t); +bool IONAME(OutputInteger128)(Cookie, common::int128_t); +bool IONAME(InputInteger)(Cookie, std::int64_t &, int kind = 8); +bool IONAME(OutputReal32)(Cookie, float); +bool IONAME(InputReal32)(Cookie, float &); +bool IONAME(OutputReal64)(Cookie, double); +bool IONAME(InputReal64)(Cookie, double &); +bool IONAME(OutputComplex32)(Cookie, float, float); +bool IONAME(InputComplex32)(Cookie, float[2]); +bool IONAME(OutputComplex64)(Cookie, double, double); +bool 
IONAME(InputComplex64)(Cookie, double[2]); +bool IONAME(OutputCharacter)(Cookie, const char *, std::size_t, int kind = 1); +bool IONAME(OutputAscii)(Cookie, const char *, std::size_t); +bool IONAME(InputCharacter)(Cookie, char *, std::size_t, int kind = 1); +bool IONAME(InputAscii)(Cookie, char *, std::size_t); +bool IONAME(OutputLogical)(Cookie, bool); +bool IONAME(InputLogical)(Cookie, bool &); // NAMELIST I/O must be the only data item in an (otherwise) // list-directed I/O statement. -bool IODECL(OutputNamelist)(Cookie, const NamelistGroup &); -bool IODECL(InputNamelist)(Cookie, const NamelistGroup &); +bool IONAME(OutputNamelist)(Cookie, const NamelistGroup &); +bool IONAME(InputNamelist)(Cookie, const NamelistGroup &); // When an I/O list item has a derived type with a specific defined // I/O subroutine of the appropriate generic kind for the active @@ -294,9 +294,9 @@ bool IODECL(InputNamelist)(Cookie, const NamelistGroup &); // made such a generic interface inaccessible), these data item transfer // APIs enable the I/O runtime to make the right calls to defined I/O // subroutines. -bool IODECL(OutputDerivedType)( +bool IONAME(OutputDerivedType)( Cookie, const Descriptor &, const NonTbpDefinedIoTable *); -bool IODECL(InputDerivedType)( +bool IONAME(InputDerivedType)( Cookie, const Descriptor &, const NonTbpDefinedIoTable *); // Additional specifier interfaces for the connection-list of @@ -304,56 +304,56 @@ bool IODECL(InputDerivedType)( // SetDelim(), GetIoMsg(), SetPad(), SetRound(), SetSign(), // & SetAsynchronous() are also acceptable for OPEN. 
// ACCESS=SEQUENTIAL, DIRECT, STREAM -bool IODECL(SetAccess)(Cookie, const char *, std::size_t); +bool IONAME(SetAccess)(Cookie, const char *, std::size_t); // ACTION=READ, WRITE, or READWRITE -bool IODECL(SetAction)(Cookie, const char *, std::size_t); +bool IONAME(SetAction)(Cookie, const char *, std::size_t); // CARRIAGECONTROL=LIST, FORTRAN, NONE -bool IODECL(SetCarriagecontrol)(Cookie, const char *, std::size_t); +bool IONAME(SetCarriagecontrol)(Cookie, const char *, std::size_t); // CONVERT=NATIVE, LITTLE_ENDIAN, BIG_ENDIAN, or SWAP -bool IODECL(SetConvert)(Cookie, const char *, std::size_t); +bool IONAME(SetConvert)(Cookie, const char *, std::size_t); // ENCODING=UTF-8, DEFAULT -bool IODECL(SetEncoding)(Cookie, const char *, std::size_t); +bool IONAME(SetEncoding)(Cookie, const char *, std::size_t); // FORM=FORMATTED, UNFORMATTED -bool IODECL(SetForm)(Cookie, const char *, std::size_t); +bool IONAME(SetForm)(Cookie, const char *, std::size_t); // POSITION=ASIS, REWIND, APPEND -bool IODECL(SetPosition)(Cookie, const char *, std::size_t); -bool IODECL(SetRecl)(Cookie, std::size_t); // RECL= +bool IONAME(SetPosition)(Cookie, const char *, std::size_t); +bool IONAME(SetRecl)(Cookie, std::size_t); // RECL= // STATUS can be set during an OPEN or CLOSE statement. 
// For OPEN: STATUS=OLD, NEW, SCRATCH, REPLACE, UNKNOWN // For CLOSE: STATUS=KEEP, DELETE -bool IODECL(SetStatus)(Cookie, const char *, std::size_t); +bool IONAME(SetStatus)(Cookie, const char *, std::size_t); -bool IODECL(SetFile)(Cookie, const char *, std::size_t chars); +bool IONAME(SetFile)(Cookie, const char *, std::size_t chars); // Acquires the runtime-created unit number for OPEN(NEWUNIT=) -bool IODECL(GetNewUnit)(Cookie, int &, int kind = 4); +bool IONAME(GetNewUnit)(Cookie, int &, int kind = 4); // READ(SIZE=), after all input items -std::size_t IODECL(GetSize)(Cookie); +std::size_t IONAME(GetSize)(Cookie); // INQUIRE(IOLENGTH=), after all output items -std::size_t IODECL(GetIoLength)(Cookie); +std::size_t IONAME(GetIoLength)(Cookie); // GetIoMsg() does not modify its argument unless an error or // end-of-record/file condition is present. -void IODECL(GetIoMsg)(Cookie, char *, std::size_t); // IOMSG= +void IONAME(GetIoMsg)(Cookie, char *, std::size_t); // IOMSG= // Defines ID= on READ/WRITE(ASYNCHRONOUS='YES') -AsynchronousId IODECL(GetAsynchronousId)(Cookie); +AsynchronousId IONAME(GetAsynchronousId)(Cookie); // INQUIRE() specifiers are mostly identified by their NUL-terminated // case-insensitive names. 
// ACCESS, ACTION, ASYNCHRONOUS, BLANK, CONVERT, DECIMAL, DELIM, DIRECT, // ENCODING, FORM, FORMATTED, NAME, PAD, POSITION, READ, READWRITE, ROUND, // SEQUENTIAL, SIGN, STREAM, UNFORMATTED, WRITE: -bool IODECL(InquireCharacter)(Cookie, InquiryKeywordHash, char *, std::size_t); +bool IONAME(InquireCharacter)(Cookie, InquiryKeywordHash, char *, std::size_t); // EXIST, NAMED, OPENED, and PENDING (without ID): -bool IODECL(InquireLogical)(Cookie, InquiryKeywordHash, bool &); +bool IONAME(InquireLogical)(Cookie, InquiryKeywordHash, bool &); // PENDING with ID -bool IODECL(InquirePendingId)(Cookie, AsynchronousId, bool &); +bool IONAME(InquirePendingId)(Cookie, AsynchronousId, bool &); // NEXTREC, NUMBER, POS, RECL, SIZE -bool IODECL(InquireInteger64)( +bool IONAME(InquireInteger64)( Cookie, InquiryKeywordHash, std::int64_t &, int kind = 8); // This function must be called to end an I/O statement, and its diff --git a/flang/runtime/environment.cpp b/flang/runtime/environment.cpp index b2c9665a28df..b74067a37777 100644 --- a/flang/runtime/environment.cpp +++ b/flang/runtime/environment.cpp @@ -49,7 +49,6 @@ static void SetEnvironmentDefaults(const EnvironmentDefaultList *envDefaults) { } } -RT_OFFLOAD_API_GROUP_BEGIN Fortran::common::optional GetConvertFromString( const char *x, std::size_t n) { static const char *keywords[]{ @@ -69,7 +68,6 @@ Fortran::common::optional GetConvertFromString( return Fortran::common::nullopt; } } -RT_OFFLOAD_API_GROUP_END void ExecutionEnvironment::Configure(int ac, const char *av[], const char *env[], const EnvironmentDefaultList *envDefaults) { diff --git a/flang/runtime/environment.h b/flang/runtime/environment.h index b8b9f10e4e57..6c56993fb1d6 100644 --- a/flang/runtime/environment.h +++ b/flang/runtime/environment.h @@ -31,7 +31,7 @@ RT_OFFLOAD_VAR_GROUP_END // External unformatted I/O data conversions enum class Convert { Unknown, Native, LittleEndian, BigEndian, Swap }; -RT_API_ATTRS Fortran::common::optional GetConvertFromString( 
+Fortran::common::optional GetConvertFromString( const char *, std::size_t); struct ExecutionEnvironment { diff --git a/flang/runtime/freestanding-tools.h b/flang/runtime/freestanding-tools.h index 9089dc6bcf53..451bf13b9fa6 100644 --- a/flang/runtime/freestanding-tools.h +++ b/flang/runtime/freestanding-tools.h @@ -52,11 +52,6 @@ #define STD_STRCPY_UNSUPPORTED 1 #endif -#if !defined(STD_STRCMP_UNSUPPORTED) && \ - (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__) -#define STD_STRCMP_UNSUPPORTED 1 -#endif - namespace Fortran::runtime { #if STD_FILL_N_UNSUPPORTED @@ -181,19 +176,5 @@ static inline RT_API_ATTRS char *strcpy(char *dest, const char *src) { using std::strcpy; #endif // !STD_STRCPY_UNSUPPORTED -#if STD_STRCMP_UNSUPPORTED -// Provides alternative implementation for std::strcmp(), if -// it is not supported. -static inline RT_API_ATTRS int strcmp(const char *lhs, const char *rhs) { - while (*lhs != '\0' && *lhs == *rhs) { - ++lhs; - ++rhs; - } - return static_cast(*lhs) - static_cast(*rhs); -} -#else // !STD_STRCMP_UNSUPPORTED -using std::strcmp; -#endif // !STD_STRCMP_UNSUPPORTED - } // namespace Fortran::runtime #endif // FORTRAN_RUNTIME_FREESTANDING_TOOLS_H_ diff --git a/flang/runtime/io-api.cpp b/flang/runtime/io-api.cpp index ccb5b576451d..3a86c9fa7375 100644 --- a/flang/runtime/io-api.cpp +++ b/flang/runtime/io-api.cpp @@ -25,9 +25,8 @@ #include namespace Fortran::runtime::io { -RT_EXT_API_GROUP_BEGIN -RT_API_ATTRS const char *InquiryKeywordHashDecode( +const char *InquiryKeywordHashDecode( char *buffer, std::size_t n, InquiryKeywordHash hash) { if (n < 1) { return nullptr; @@ -45,7 +44,7 @@ RT_API_ATTRS const char *InquiryKeywordHashDecode( } template -RT_API_ATTRS Cookie BeginInternalArrayListIO(const Descriptor &descriptor, +Cookie BeginInternalArrayListIO(const Descriptor &descriptor, void ** /*scratchArea*/, std::size_t /*scratchBytes*/, const char *sourceFile, int sourceLine) { Terminator oom{sourceFile, sourceLine}; @@ -55,14 
+54,14 @@ RT_API_ATTRS Cookie BeginInternalArrayListIO(const Descriptor &descriptor, ->ioStatementState(); } -Cookie IODEF(BeginInternalArrayListOutput)(const Descriptor &descriptor, +Cookie IONAME(BeginInternalArrayListOutput)(const Descriptor &descriptor, void **scratchArea, std::size_t scratchBytes, const char *sourceFile, int sourceLine) { return BeginInternalArrayListIO( descriptor, scratchArea, scratchBytes, sourceFile, sourceLine); } -Cookie IODEF(BeginInternalArrayListInput)(const Descriptor &descriptor, +Cookie IONAME(BeginInternalArrayListInput)(const Descriptor &descriptor, void **scratchArea, std::size_t scratchBytes, const char *sourceFile, int sourceLine) { return BeginInternalArrayListIO( @@ -70,7 +69,7 @@ Cookie IODEF(BeginInternalArrayListInput)(const Descriptor &descriptor, } template -RT_API_ATTRS Cookie BeginInternalArrayFormattedIO(const Descriptor &descriptor, +Cookie BeginInternalArrayFormattedIO(const Descriptor &descriptor, const char *format, std::size_t formatLength, const Descriptor *formatDescriptor, void ** /*scratchArea*/, std::size_t /*scratchBytes*/, const char *sourceFile, int sourceLine) { @@ -81,7 +80,7 @@ RT_API_ATTRS Cookie BeginInternalArrayFormattedIO(const Descriptor &descriptor, ->ioStatementState(); } -Cookie IODEF(BeginInternalArrayFormattedOutput)(const Descriptor &descriptor, +Cookie IONAME(BeginInternalArrayFormattedOutput)(const Descriptor &descriptor, const char *format, std::size_t formatLength, const Descriptor *formatDescriptor, void **scratchArea, std::size_t scratchBytes, const char *sourceFile, int sourceLine) { @@ -90,7 +89,7 @@ Cookie IODEF(BeginInternalArrayFormattedOutput)(const Descriptor &descriptor, sourceLine); } -Cookie IODEF(BeginInternalArrayFormattedInput)(const Descriptor &descriptor, +Cookie IONAME(BeginInternalArrayFormattedInput)(const Descriptor &descriptor, const char *format, std::size_t formatLength, const Descriptor *formatDescriptor, void **scratchArea, std::size_t scratchBytes, const char 
*sourceFile, int sourceLine) { @@ -111,14 +110,14 @@ RT_API_ATTRS Cookie BeginInternalListIO( ->ioStatementState(); } -Cookie IODEF(BeginInternalListOutput)(char *internal, +Cookie IONAME(BeginInternalListOutput)(char *internal, std::size_t internalLength, void **scratchArea, std::size_t scratchBytes, const char *sourceFile, int sourceLine) { return BeginInternalListIO(internal, internalLength, scratchArea, scratchBytes, sourceFile, sourceLine); } -Cookie IODEF(BeginInternalListInput)(const char *internal, +Cookie IONAME(BeginInternalListInput)(const char *internal, std::size_t internalLength, void **scratchArea, std::size_t scratchBytes, const char *sourceFile, int sourceLine) { return BeginInternalListIO(internal, internalLength, @@ -126,7 +125,7 @@ Cookie IODEF(BeginInternalListInput)(const char *internal, } template -RT_API_ATTRS Cookie BeginInternalFormattedIO( +Cookie BeginInternalFormattedIO( std::conditional_t *internal, std::size_t internalLength, const char *format, std::size_t formatLength, const Descriptor *formatDescriptor, void ** /*scratchArea*/, @@ -139,7 +138,7 @@ RT_API_ATTRS Cookie BeginInternalFormattedIO( ->ioStatementState(); } -Cookie IODEF(BeginInternalFormattedOutput)(char *internal, +Cookie IONAME(BeginInternalFormattedOutput)(char *internal, std::size_t internalLength, const char *format, std::size_t formatLength, const Descriptor *formatDescriptor, void **scratchArea, std::size_t scratchBytes, const char *sourceFile, int sourceLine) { @@ -148,7 +147,7 @@ Cookie IODEF(BeginInternalFormattedOutput)(char *internal, sourceFile, sourceLine); } -Cookie IODEF(BeginInternalFormattedInput)(const char *internal, +Cookie IONAME(BeginInternalFormattedInput)(const char *internal, std::size_t internalLength, const char *format, std::size_t formatLength, const Descriptor *formatDescriptor, void **scratchArea, std::size_t scratchBytes, const char *sourceFile, int sourceLine) { @@ -228,22 +227,24 @@ RT_API_ATTRS Cookie BeginExternalListIO( } } 
+RT_EXT_API_GROUP_BEGIN Cookie IODEF(BeginExternalListOutput)( ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { return BeginExternalListIO( unitNumber, sourceFile, sourceLine); } +RT_EXT_API_GROUP_END -Cookie IODEF(BeginExternalListInput)( +Cookie IONAME(BeginExternalListInput)( ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { return BeginExternalListIO( unitNumber, sourceFile, sourceLine); } template -RT_API_ATTRS Cookie BeginExternalFormattedIO(const char *format, - std::size_t formatLength, const Descriptor *formatDescriptor, - ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { +Cookie BeginExternalFormattedIO(const char *format, std::size_t formatLength, + const Descriptor *formatDescriptor, ExternalUnit unitNumber, + const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; Cookie errorCookie{nullptr}; ExternalFileUnit *unit{GetOrCreateUnit( @@ -285,14 +286,14 @@ RT_API_ATTRS Cookie BeginExternalFormattedIO(const char *format, } } -Cookie IODEF(BeginExternalFormattedOutput)(const char *format, +Cookie IONAME(BeginExternalFormattedOutput)(const char *format, std::size_t formatLength, const Descriptor *formatDescriptor, ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { return BeginExternalFormattedIO(format, formatLength, formatDescriptor, unitNumber, sourceFile, sourceLine); } -Cookie IODEF(BeginExternalFormattedInput)(const char *format, +Cookie IONAME(BeginExternalFormattedInput)(const char *format, std::size_t formatLength, const Descriptor *formatDescriptor, ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { return BeginExternalFormattedIO(format, formatLength, @@ -300,7 +301,7 @@ Cookie IODEF(BeginExternalFormattedInput)(const char *format, } template -RT_API_ATTRS Cookie BeginUnformattedIO( +Cookie BeginUnformattedIO( ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; Cookie 
errorCookie{nullptr}; @@ -351,19 +352,19 @@ RT_API_ATTRS Cookie BeginUnformattedIO( } } -Cookie IODEF(BeginUnformattedOutput)( +Cookie IONAME(BeginUnformattedOutput)( ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { return BeginUnformattedIO( unitNumber, sourceFile, sourceLine); } -Cookie IODEF(BeginUnformattedInput)( +Cookie IONAME(BeginUnformattedInput)( ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { return BeginUnformattedIO( unitNumber, sourceFile, sourceLine); } -Cookie IODEF(BeginOpenUnit)( // OPEN(without NEWUNIT=) +Cookie IONAME(BeginOpenUnit)( // OPEN(without NEWUNIT=) ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; bool wasExtant{false}; @@ -383,7 +384,7 @@ Cookie IODEF(BeginOpenUnit)( // OPEN(without NEWUNIT=) } } -Cookie IODEF(BeginOpenNewUnit)( // OPEN(NEWUNIT=j) +Cookie IONAME(BeginOpenNewUnit)( // OPEN(NEWUNIT=j) const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; ExternalFileUnit &unit{ @@ -393,7 +394,7 @@ Cookie IODEF(BeginOpenNewUnit)( // OPEN(NEWUNIT=j) sourceLine); } -Cookie IODEF(BeginWait)(ExternalUnit unitNumber, AsynchronousId id, +Cookie IONAME(BeginWait)(ExternalUnit unitNumber, AsynchronousId id, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; if (ExternalFileUnit * unit{ExternalFileUnit::LookUp(unitNumber)}) { @@ -409,12 +410,12 @@ Cookie IODEF(BeginWait)(ExternalUnit unitNumber, AsynchronousId id, terminator, unitNumber, id == 0 ? 
IostatOk : IostatBadWaitUnit); } } -Cookie IODEF(BeginWaitAll)( +Cookie IONAME(BeginWaitAll)( ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { return IONAME(BeginWait)(unitNumber, 0 /*no ID=*/, sourceFile, sourceLine); } -Cookie IODEF(BeginClose)( +Cookie IONAME(BeginClose)( ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; if (ExternalFileUnit * unit{ExternalFileUnit::LookUp(unitNumber)}) { @@ -433,7 +434,7 @@ Cookie IODEF(BeginClose)( } } -Cookie IODEF(BeginFlush)( +Cookie IONAME(BeginFlush)( ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; if (ExternalFileUnit * unit{ExternalFileUnit::LookUp(unitNumber)}) { @@ -451,7 +452,7 @@ Cookie IODEF(BeginFlush)( } } -Cookie IODEF(BeginBackspace)( +Cookie IONAME(BeginBackspace)( ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; if (ExternalFileUnit * unit{ExternalFileUnit::LookUp(unitNumber)}) { @@ -469,7 +470,7 @@ Cookie IODEF(BeginBackspace)( } } -Cookie IODEF(BeginEndfile)( +Cookie IONAME(BeginEndfile)( ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; Cookie errorCookie{nullptr}; @@ -489,7 +490,7 @@ Cookie IODEF(BeginEndfile)( } } -Cookie IODEF(BeginRewind)( +Cookie IONAME(BeginRewind)( ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; Cookie errorCookie{nullptr}; @@ -509,7 +510,7 @@ Cookie IODEF(BeginRewind)( } } -Cookie IODEF(BeginInquireUnit)( +Cookie IONAME(BeginInquireUnit)( ExternalUnit unitNumber, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; if (ExternalFileUnit * unit{ExternalFileUnit::LookUp(unitNumber)}) { @@ -529,14 +530,14 @@ Cookie IODEF(BeginInquireUnit)( } } -Cookie IODEF(BeginInquireFile)(const char *path, std::size_t 
pathLength, +Cookie IONAME(BeginInquireFile)(const char *path, std::size_t pathLength, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; auto trimmed{SaveDefaultCharacter( path, TrimTrailingSpaces(path, pathLength), terminator)}; if (ExternalFileUnit * unit{ExternalFileUnit::LookUp( - trimmed.get(), Fortran::runtime::strlen(trimmed.get()))}) { + trimmed.get(), std::strlen(trimmed.get()))}) { // INQUIRE(FILE=) to a connected unit if (ChildIo * child{unit->GetChildIo()}) { return &child->BeginIoStatement( @@ -553,7 +554,7 @@ Cookie IODEF(BeginInquireFile)(const char *path, std::size_t pathLength, } } -Cookie IODEF(BeginInquireIoLength)(const char *sourceFile, int sourceLine) { +Cookie IONAME(BeginInquireIoLength)(const char *sourceFile, int sourceLine) { Terminator oom{sourceFile, sourceLine}; return &New{oom}(sourceFile, sourceLine) .release() @@ -562,7 +563,7 @@ Cookie IODEF(BeginInquireIoLength)(const char *sourceFile, int sourceLine) { // Control list items -void IODEF(EnableHandlers)(Cookie cookie, bool hasIoStat, bool hasErr, +void IONAME(EnableHandlers)(Cookie cookie, bool hasIoStat, bool hasErr, bool hasEnd, bool hasEor, bool hasIoMsg) { IoErrorHandler &handler{cookie->GetIoErrorHandler()}; if (hasIoStat) { @@ -582,8 +583,8 @@ void IODEF(EnableHandlers)(Cookie cookie, bool hasIoStat, bool hasErr, } } -static RT_API_ATTRS bool YesOrNo(const char *keyword, std::size_t length, - const char *what, IoErrorHandler &handler) { +static bool YesOrNo(const char *keyword, std::size_t length, const char *what, + IoErrorHandler &handler) { static const char *keywords[]{"YES", "NO", nullptr}; switch (IdentifyValue(keyword, length, keywords)) { case 0: @@ -597,7 +598,8 @@ static RT_API_ATTRS bool YesOrNo(const char *keyword, std::size_t length, } } -bool IODEF(SetAdvance)(Cookie cookie, const char *keyword, std::size_t length) { +bool IONAME(SetAdvance)( + Cookie cookie, const char *keyword, std::size_t length) { IoStatementState 
&io{*cookie}; IoErrorHandler &handler{io.GetIoErrorHandler()}; bool nonAdvancing{!YesOrNo(keyword, length, "ADVANCE", handler)}; @@ -614,7 +616,7 @@ bool IODEF(SetAdvance)(Cookie cookie, const char *keyword, std::size_t length) { return !handler.InError(); } -bool IODEF(SetBlank)(Cookie cookie, const char *keyword, std::size_t length) { +bool IONAME(SetBlank)(Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; static const char *keywords[]{"NULL", "ZERO", nullptr}; switch (IdentifyValue(keyword, length, keywords)) { @@ -631,7 +633,8 @@ bool IODEF(SetBlank)(Cookie cookie, const char *keyword, std::size_t length) { } } -bool IODEF(SetDecimal)(Cookie cookie, const char *keyword, std::size_t length) { +bool IONAME(SetDecimal)( + Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; static const char *keywords[]{"COMMA", "POINT", nullptr}; switch (IdentifyValue(keyword, length, keywords)) { @@ -648,7 +651,7 @@ bool IODEF(SetDecimal)(Cookie cookie, const char *keyword, std::size_t length) { } } -bool IODEF(SetDelim)(Cookie cookie, const char *keyword, std::size_t length) { +bool IONAME(SetDelim)(Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; static const char *keywords[]{"APOSTROPHE", "QUOTE", "NONE", nullptr}; switch (IdentifyValue(keyword, length, keywords)) { @@ -668,14 +671,14 @@ bool IODEF(SetDelim)(Cookie cookie, const char *keyword, std::size_t length) { } } -bool IODEF(SetPad)(Cookie cookie, const char *keyword, std::size_t length) { +bool IONAME(SetPad)(Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; IoErrorHandler &handler{io.GetIoErrorHandler()}; io.mutableModes().pad = YesOrNo(keyword, length, "PAD", handler); return !handler.InError(); } -bool IODEF(SetPos)(Cookie cookie, std::int64_t pos) { +bool IONAME(SetPos)(Cookie cookie, std::int64_t pos) { IoStatementState &io{*cookie}; IoErrorHandler 
&handler{io.GetIoErrorHandler()}; if (auto *unit{io.GetExternalFileUnit()}) { @@ -686,7 +689,7 @@ bool IODEF(SetPos)(Cookie cookie, std::int64_t pos) { return false; } -bool IODEF(SetRec)(Cookie cookie, std::int64_t rec) { +bool IONAME(SetRec)(Cookie cookie, std::int64_t rec) { IoStatementState &io{*cookie}; IoErrorHandler &handler{io.GetIoErrorHandler()}; if (auto *unit{io.GetExternalFileUnit()}) { @@ -702,7 +705,7 @@ bool IODEF(SetRec)(Cookie cookie, std::int64_t rec) { return true; } -bool IODEF(SetRound)(Cookie cookie, const char *keyword, std::size_t length) { +bool IONAME(SetRound)(Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; static const char *keywords[]{"UP", "DOWN", "ZERO", "NEAREST", "COMPATIBLE", "PROCESSOR_DEFINED", nullptr}; @@ -732,7 +735,7 @@ bool IODEF(SetRound)(Cookie cookie, const char *keyword, std::size_t length) { } } -bool IODEF(SetSign)(Cookie cookie, const char *keyword, std::size_t length) { +bool IONAME(SetSign)(Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; static const char *keywords[]{ "PLUS", "SUPPRESS", "PROCESSOR_DEFINED", nullptr}; @@ -751,7 +754,7 @@ bool IODEF(SetSign)(Cookie cookie, const char *keyword, std::size_t length) { } } -bool IODEF(SetAccess)(Cookie cookie, const char *keyword, std::size_t length) { +bool IONAME(SetAccess)(Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; auto *open{io.get_if()}; if (!open) { @@ -787,7 +790,7 @@ bool IODEF(SetAccess)(Cookie cookie, const char *keyword, std::size_t length) { return true; } -bool IODEF(SetAction)(Cookie cookie, const char *keyword, std::size_t length) { +bool IONAME(SetAction)(Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; auto *open{io.get_if()}; if (!open) { @@ -829,7 +832,7 @@ bool IODEF(SetAction)(Cookie cookie, const char *keyword, std::size_t length) { return true; } -bool IODEF(SetAsynchronous)( +bool 
IONAME(SetAsynchronous)( Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; IoErrorHandler &handler{io.GetIoErrorHandler()}; @@ -856,7 +859,7 @@ bool IODEF(SetAsynchronous)( return !handler.InError(); } -bool IODEF(SetCarriagecontrol)( +bool IONAME(SetCarriagecontrol)( Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; auto *open{io.get_if()}; @@ -888,7 +891,8 @@ bool IODEF(SetCarriagecontrol)( } } -bool IODEF(SetConvert)(Cookie cookie, const char *keyword, std::size_t length) { +bool IONAME(SetConvert)( + Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; auto *open{io.get_if()}; if (!open) { @@ -912,7 +916,7 @@ bool IODEF(SetConvert)(Cookie cookie, const char *keyword, std::size_t length) { } } -bool IODEF(SetEncoding)( +bool IONAME(SetEncoding)( Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; auto *open{io.get_if()}; @@ -944,7 +948,7 @@ bool IODEF(SetEncoding)( return true; } -bool IODEF(SetForm)(Cookie cookie, const char *keyword, std::size_t length) { +bool IONAME(SetForm)(Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; auto *open{io.get_if()}; if (!open) { @@ -972,7 +976,7 @@ bool IODEF(SetForm)(Cookie cookie, const char *keyword, std::size_t length) { return true; } -bool IODEF(SetPosition)( +bool IONAME(SetPosition)( Cookie cookie, const char *keyword, std::size_t length) { IoStatementState &io{*cookie}; auto *open{io.get_if()}; @@ -1005,7 +1009,7 @@ bool IODEF(SetPosition)( return true; } -bool IODEF(SetRecl)(Cookie cookie, std::size_t n) { +bool IONAME(SetRecl)(Cookie cookie, std::size_t n) { IoStatementState &io{*cookie}; auto *open{io.get_if()}; if (!open) { @@ -1032,7 +1036,7 @@ bool IODEF(SetRecl)(Cookie cookie, std::size_t n) { } } -bool IODEF(SetStatus)(Cookie cookie, const char *keyword, std::size_t length) { +bool IONAME(SetStatus)(Cookie cookie, const 
char *keyword, std::size_t length) { IoStatementState &io{*cookie}; if (auto *open{io.get_if()}) { if (open->completedOperation()) { @@ -1086,7 +1090,7 @@ bool IODEF(SetStatus)(Cookie cookie, const char *keyword, std::size_t length) { "SetStatus() called when not in an OPEN or CLOSE statement"); } -bool IODEF(SetFile)(Cookie cookie, const char *path, std::size_t chars) { +bool IONAME(SetFile)(Cookie cookie, const char *path, std::size_t chars) { IoStatementState &io{*cookie}; if (auto *open{io.get_if()}) { if (open->completedOperation()) { @@ -1103,7 +1107,7 @@ bool IODEF(SetFile)(Cookie cookie, const char *path, std::size_t chars) { return false; } -bool IODEF(GetNewUnit)(Cookie cookie, int &unit, int kind) { +bool IONAME(GetNewUnit)(Cookie cookie, int &unit, int kind) { IoStatementState &io{*cookie}; auto *open{io.get_if()}; if (!open) { @@ -1131,15 +1135,15 @@ bool IODEF(GetNewUnit)(Cookie cookie, int &unit, int kind) { // Data transfers -bool IODEF(OutputDescriptor)(Cookie cookie, const Descriptor &descriptor) { +bool IONAME(OutputDescriptor)(Cookie cookie, const Descriptor &descriptor) { return descr::DescriptorIO(*cookie, descriptor); } -bool IODEF(InputDescriptor)(Cookie cookie, const Descriptor &descriptor) { +bool IONAME(InputDescriptor)(Cookie cookie, const Descriptor &descriptor) { return descr::DescriptorIO(*cookie, descriptor); } -bool IODEF(OutputInteger8)(Cookie cookie, std::int8_t n) { +bool IONAME(OutputInteger8)(Cookie cookie, std::int8_t n) { if (!cookie->CheckFormattedStmtType("OutputInteger8")) { return false; } @@ -1150,7 +1154,7 @@ bool IODEF(OutputInteger8)(Cookie cookie, std::int8_t n) { return descr::DescriptorIO(*cookie, descriptor); } -bool IODEF(OutputInteger16)(Cookie cookie, std::int16_t n) { +bool IONAME(OutputInteger16)(Cookie cookie, std::int16_t n) { if (!cookie->CheckFormattedStmtType("OutputInteger16")) { return false; } @@ -1161,6 +1165,7 @@ bool IODEF(OutputInteger16)(Cookie cookie, std::int16_t n) { return 
descr::DescriptorIO(*cookie, descriptor); } +RT_EXT_API_GROUP_BEGIN bool IODEF(OutputInteger32)(Cookie cookie, std::int32_t n) { if (!cookie->CheckFormattedStmtType("OutputInteger32")) { return false; @@ -1171,8 +1176,9 @@ bool IODEF(OutputInteger32)(Cookie cookie, std::int32_t n) { TypeCategory::Integer, 4, reinterpret_cast(&n), 0); return descr::DescriptorIO(*cookie, descriptor); } +RT_EXT_API_GROUP_END -bool IODEF(OutputInteger64)(Cookie cookie, std::int64_t n) { +bool IONAME(OutputInteger64)(Cookie cookie, std::int64_t n) { if (!cookie->CheckFormattedStmtType("OutputInteger64")) { return false; } @@ -1184,7 +1190,7 @@ bool IODEF(OutputInteger64)(Cookie cookie, std::int64_t n) { } #ifdef __SIZEOF_INT128__ -bool IODEF(OutputInteger128)(Cookie cookie, common::int128_t n) { +bool IONAME(OutputInteger128)(Cookie cookie, common::int128_t n) { if (!cookie->CheckFormattedStmtType("OutputInteger128")) { return false; } @@ -1196,7 +1202,7 @@ bool IODEF(OutputInteger128)(Cookie cookie, common::int128_t n) { } #endif -bool IODEF(InputInteger)(Cookie cookie, std::int64_t &n, int kind) { +bool IONAME(InputInteger)(Cookie cookie, std::int64_t &n, int kind) { if (!cookie->CheckFormattedStmtType("InputInteger")) { return false; } @@ -1207,7 +1213,7 @@ bool IODEF(InputInteger)(Cookie cookie, std::int64_t &n, int kind) { return descr::DescriptorIO(*cookie, descriptor); } -bool IODEF(OutputReal32)(Cookie cookie, float x) { +bool IONAME(OutputReal32)(Cookie cookie, float x) { if (!cookie->CheckFormattedStmtType("OutputReal32")) { return false; } @@ -1217,7 +1223,7 @@ bool IODEF(OutputReal32)(Cookie cookie, float x) { return descr::DescriptorIO(*cookie, descriptor); } -bool IODEF(OutputReal64)(Cookie cookie, double x) { +bool IONAME(OutputReal64)(Cookie cookie, double x) { if (!cookie->CheckFormattedStmtType("OutputReal64")) { return false; } @@ -1227,7 +1233,7 @@ bool IODEF(OutputReal64)(Cookie cookie, double x) { return descr::DescriptorIO(*cookie, descriptor); } -bool 
IODEF(InputReal32)(Cookie cookie, float &x) { +bool IONAME(InputReal32)(Cookie cookie, float &x) { if (!cookie->CheckFormattedStmtType("InputReal32")) { return false; } @@ -1237,7 +1243,7 @@ bool IODEF(InputReal32)(Cookie cookie, float &x) { return descr::DescriptorIO(*cookie, descriptor); } -bool IODEF(InputReal64)(Cookie cookie, double &x) { +bool IONAME(InputReal64)(Cookie cookie, double &x) { if (!cookie->CheckFormattedStmtType("InputReal64")) { return false; } @@ -1247,7 +1253,7 @@ bool IODEF(InputReal64)(Cookie cookie, double &x) { return descr::DescriptorIO(*cookie, descriptor); } -bool IODEF(OutputComplex32)(Cookie cookie, float r, float i) { +bool IONAME(OutputComplex32)(Cookie cookie, float r, float i) { if (!cookie->CheckFormattedStmtType("OutputComplex32")) { return false; } @@ -1259,7 +1265,7 @@ bool IODEF(OutputComplex32)(Cookie cookie, float r, float i) { return descr::DescriptorIO(*cookie, descriptor); } -bool IODEF(OutputComplex64)(Cookie cookie, double r, double i) { +bool IONAME(OutputComplex64)(Cookie cookie, double r, double i) { if (!cookie->CheckFormattedStmtType("OutputComplex64")) { return false; } @@ -1271,7 +1277,7 @@ bool IODEF(OutputComplex64)(Cookie cookie, double r, double i) { return descr::DescriptorIO(*cookie, descriptor); } -bool IODEF(InputComplex32)(Cookie cookie, float z[2]) { +bool IONAME(InputComplex32)(Cookie cookie, float z[2]) { if (!cookie->CheckFormattedStmtType("InputComplex32")) { return false; } @@ -1282,7 +1288,7 @@ bool IODEF(InputComplex32)(Cookie cookie, float z[2]) { return descr::DescriptorIO(*cookie, descriptor); } -bool IODEF(InputComplex64)(Cookie cookie, double z[2]) { +bool IONAME(InputComplex64)(Cookie cookie, double z[2]) { if (!cookie->CheckFormattedStmtType("InputComplex64")) { return false; } @@ -1293,7 +1299,7 @@ bool IODEF(InputComplex64)(Cookie cookie, double z[2]) { return descr::DescriptorIO(*cookie, descriptor); } -bool IODEF(OutputCharacter)( +bool IONAME(OutputCharacter)( Cookie cookie, const 
char *x, std::size_t length, int kind) { if (!cookie->CheckFormattedStmtType("OutputCharacter")) { return false; @@ -1305,11 +1311,11 @@ bool IODEF(OutputCharacter)( return descr::DescriptorIO(*cookie, descriptor); } -bool IODEF(OutputAscii)(Cookie cookie, const char *x, std::size_t length) { +bool IONAME(OutputAscii)(Cookie cookie, const char *x, std::size_t length) { return IONAME(OutputCharacter(cookie, x, length, 1)); } -bool IODEF(InputCharacter)( +bool IONAME(InputCharacter)( Cookie cookie, char *x, std::size_t length, int kind) { if (!cookie->CheckFormattedStmtType("InputCharacter")) { return false; @@ -1320,11 +1326,11 @@ bool IODEF(InputCharacter)( return descr::DescriptorIO(*cookie, descriptor); } -bool IODEF(InputAscii)(Cookie cookie, char *x, std::size_t length) { +bool IONAME(InputAscii)(Cookie cookie, char *x, std::size_t length) { return IONAME(InputCharacter)(cookie, x, length, 1); } -bool IODEF(OutputLogical)(Cookie cookie, bool truth) { +bool IONAME(OutputLogical)(Cookie cookie, bool truth) { if (!cookie->CheckFormattedStmtType("OutputLogical")) { return false; } @@ -1335,7 +1341,7 @@ bool IODEF(OutputLogical)(Cookie cookie, bool truth) { return descr::DescriptorIO(*cookie, descriptor); } -bool IODEF(InputLogical)(Cookie cookie, bool &truth) { +bool IONAME(InputLogical)(Cookie cookie, bool &truth) { if (!cookie->CheckFormattedStmtType("InputLogical")) { return false; } @@ -1346,17 +1352,17 @@ bool IODEF(InputLogical)(Cookie cookie, bool &truth) { return descr::DescriptorIO(*cookie, descriptor); } -bool IODEF(OutputDerivedType)(Cookie cookie, const Descriptor &descriptor, +bool IONAME(OutputDerivedType)(Cookie cookie, const Descriptor &descriptor, const NonTbpDefinedIoTable *table) { return descr::DescriptorIO(*cookie, descriptor, table); } -bool IODEF(InputDerivedType)(Cookie cookie, const Descriptor &descriptor, +bool IONAME(InputDerivedType)(Cookie cookie, const Descriptor &descriptor, const NonTbpDefinedIoTable *table) { return 
descr::DescriptorIO(*cookie, descriptor, table); } -std::size_t IODEF(GetSize)(Cookie cookie) { +std::size_t IONAME(GetSize)(Cookie cookie) { IoStatementState &io{*cookie}; IoErrorHandler &handler{io.GetIoErrorHandler()}; if (!handler.InError()) { @@ -1373,7 +1379,7 @@ std::size_t IODEF(GetSize)(Cookie cookie) { return 0; } -std::size_t IODEF(GetIoLength)(Cookie cookie) { +std::size_t IONAME(GetIoLength)(Cookie cookie) { IoStatementState &io{*cookie}; IoErrorHandler &handler{io.GetIoErrorHandler()}; if (!handler.InError()) { @@ -1389,7 +1395,7 @@ std::size_t IODEF(GetIoLength)(Cookie cookie) { return 0; } -void IODEF(GetIoMsg)(Cookie cookie, char *msg, std::size_t length) { +void IONAME(GetIoMsg)(Cookie cookie, char *msg, std::size_t length) { IoStatementState &io{*cookie}; IoErrorHandler &handler{io.GetIoErrorHandler()}; if (!handler.InError()) { @@ -1400,7 +1406,7 @@ void IODEF(GetIoMsg)(Cookie cookie, char *msg, std::size_t length) { } } -AsynchronousId IODEF(GetAsynchronousId)(Cookie cookie) { +AsynchronousId IONAME(GetAsynchronousId)(Cookie cookie) { IoStatementState &io{*cookie}; IoErrorHandler &handler{io.GetIoErrorHandler()}; if (auto *ext{io.get_if()}) { @@ -1413,24 +1419,24 @@ AsynchronousId IODEF(GetAsynchronousId)(Cookie cookie) { return 0; } -bool IODEF(InquireCharacter)(Cookie cookie, InquiryKeywordHash inquiry, +bool IONAME(InquireCharacter)(Cookie cookie, InquiryKeywordHash inquiry, char *result, std::size_t length) { IoStatementState &io{*cookie}; return io.Inquire(inquiry, result, length); } -bool IODEF(InquireLogical)( +bool IONAME(InquireLogical)( Cookie cookie, InquiryKeywordHash inquiry, bool &result) { IoStatementState &io{*cookie}; return io.Inquire(inquiry, result); } -bool IODEF(InquirePendingId)(Cookie cookie, AsynchronousId id, bool &result) { +bool IONAME(InquirePendingId)(Cookie cookie, AsynchronousId id, bool &result) { IoStatementState &io{*cookie}; return io.Inquire(HashInquiryKeyword("PENDING"), id, result); } -bool 
IODEF(InquireInteger64)( +bool IONAME(InquireInteger64)( Cookie cookie, InquiryKeywordHash inquiry, std::int64_t &result, int kind) { IoStatementState &io{*cookie}; std::int64_t n{0}; // safe "undefined" value @@ -1446,15 +1452,17 @@ bool IODEF(InquireInteger64)( return false; } +RT_EXT_API_GROUP_BEGIN enum Iostat IODEF(EndIoStatement)(Cookie cookie) { IoStatementState &io{*cookie}; return static_cast(io.EndIoStatement()); } +RT_EXT_API_GROUP_END template -static RT_API_ATTRS enum Iostat CheckUnitNumberInRangeImpl(INT unit, - bool handleError, char *ioMsg, std::size_t ioMsgLength, - const char *sourceFile, int sourceLine) { +static enum Iostat CheckUnitNumberInRangeImpl(INT unit, bool handleError, + char *ioMsg, std::size_t ioMsgLength, const char *sourceFile, + int sourceLine) { static_assert(sizeof(INT) >= sizeof(ExternalUnit), "only intended to be used when the INT to ExternalUnit conversion is " "narrowing"); @@ -1486,15 +1494,15 @@ static RT_API_ATTRS enum Iostat CheckUnitNumberInRangeImpl(INT unit, return IostatOk; } -enum Iostat IODEF(CheckUnitNumberInRange64)(std::int64_t unit, bool handleError, - char *ioMsg, std::size_t ioMsgLength, const char *sourceFile, - int sourceLine) { +enum Iostat IONAME(CheckUnitNumberInRange64)(std::int64_t unit, + bool handleError, char *ioMsg, std::size_t ioMsgLength, + const char *sourceFile, int sourceLine) { return CheckUnitNumberInRangeImpl( unit, handleError, ioMsg, ioMsgLength, sourceFile, sourceLine); } #ifdef __SIZEOF_INT128__ -enum Iostat IODEF(CheckUnitNumberInRange128)(common::int128_t unit, +enum Iostat IONAME(CheckUnitNumberInRange128)(common::int128_t unit, bool handleError, char *ioMsg, std::size_t ioMsgLength, const char *sourceFile, int sourceLine) { return CheckUnitNumberInRangeImpl( @@ -1517,5 +1525,3 @@ void std::__libcpp_verbose_abort(char const *format, ...) 
{ std::abort(); } #endif - -RT_EXT_API_GROUP_END diff --git a/flang/runtime/io-error.cpp b/flang/runtime/io-error.cpp index 7a90966f8104..b006b82f6224 100644 --- a/flang/runtime/io-error.cpp +++ b/flang/runtime/io-error.cpp @@ -109,6 +109,8 @@ void IoErrorHandler::SignalPendingError() { SignalError(error); } +RT_OFFLOAD_API_GROUP_END + void IoErrorHandler::SignalErrno() { SignalError(errno); } bool IoErrorHandler::GetIoMsg(char *buffer, std::size_t bufferLength) { @@ -125,10 +127,7 @@ bool IoErrorHandler::GetIoMsg(char *buffer, std::size_t bufferLength) { // in LLVM v9.0.1 with inadequate modification for Fortran, // since rectified. bool ok{false}; -#if defined(RT_DEVICE_COMPILATION) - // strerror_r is not available on device. - msg = "errno description is not available on device"; -#elif HAVE_STRERROR_R +#if HAVE_STRERROR_R // strerror_r is thread-safe. #if defined(__GLIBC__) && defined(_GNU_SOURCE) // glibc defines its own incompatible version of strerror_r @@ -158,6 +157,4 @@ bool IoErrorHandler::GetIoMsg(char *buffer, std::size_t bufferLength) { return false; } } - -RT_OFFLOAD_API_GROUP_END } // namespace Fortran::runtime::io diff --git a/flang/runtime/io-error.h b/flang/runtime/io-error.h index 426573e2faf0..0fe11c9185c0 100644 --- a/flang/runtime/io-error.h +++ b/flang/runtime/io-error.h @@ -61,7 +61,7 @@ public: RT_API_ATTRS void SignalPendingError(); RT_API_ATTRS int GetIoStat() const { return ioStat_; } - RT_API_ATTRS bool GetIoMsg(char *, std::size_t); + bool GetIoMsg(char *, std::size_t); private: enum Flag : std::uint8_t { diff --git a/flang/runtime/namelist.cpp b/flang/runtime/namelist.cpp index b9eed2101ecf..b502d41a8d5c 100644 --- a/flang/runtime/namelist.cpp +++ b/flang/runtime/namelist.cpp @@ -17,20 +17,16 @@ namespace Fortran::runtime::io { -RT_VAR_GROUP_BEGIN // Max size of a group, symbol or component identifier that can appear in // NAMELIST input, plus a byte for NUL termination. 
-static constexpr RT_CONST_VAR_ATTRS std::size_t nameBufferSize{201}; -RT_VAR_GROUP_END +static constexpr std::size_t nameBufferSize{201}; -RT_OFFLOAD_API_GROUP_BEGIN - -static inline RT_API_ATTRS char32_t GetComma(IoStatementState &io) { +static inline char32_t GetComma(IoStatementState &io) { return io.mutableModes().editingFlags & decimalComma ? char32_t{';'} : char32_t{','}; } -bool IODEF(OutputNamelist)(Cookie cookie, const NamelistGroup &group) { +bool IONAME(OutputNamelist)(Cookie cookie, const NamelistGroup &group) { IoStatementState &io{*cookie}; io.CheckFormattedStmtType("OutputNamelist"); io.mutableModes().inNamelist = true; @@ -44,8 +40,7 @@ bool IODEF(OutputNamelist)(Cookie cookie, const NamelistGroup &group) { if ((connection.NeedAdvance(prefixLen) && !(io.AdvanceRecord() && EmitAscii(io, " ", 1))) || !EmitAscii(io, prefix, prefixLen) || - (connection.NeedAdvance( - Fortran::runtime::strlen(str) + (suffix != ' ')) && + (connection.NeedAdvance(std::strlen(str) + (suffix != ' ')) && !(io.AdvanceRecord() && EmitAscii(io, " ", 1)))) { return false; } @@ -89,20 +84,20 @@ bool IODEF(OutputNamelist)(Cookie cookie, const NamelistGroup &group) { return EmitUpperCase("/", 1, "", ' '); } -static constexpr RT_API_ATTRS bool IsLegalIdStart(char32_t ch) { +static constexpr bool IsLegalIdStart(char32_t ch) { return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_' || ch == '@'; } -static constexpr RT_API_ATTRS bool IsLegalIdChar(char32_t ch) { +static constexpr bool IsLegalIdChar(char32_t ch) { return IsLegalIdStart(ch) || (ch >= '0' && ch <= '9'); } -static constexpr RT_API_ATTRS char NormalizeIdChar(char32_t ch) { +static constexpr char NormalizeIdChar(char32_t ch) { return static_cast(ch >= 'A' && ch <= 'Z' ? 
ch - 'A' + 'a' : ch); } -static RT_API_ATTRS bool GetLowerCaseName( +static bool GetLowerCaseName( IoStatementState &io, char buffer[], std::size_t maxLength) { std::size_t byteLength{0}; if (auto ch{io.GetNextNonBlank(byteLength)}) { @@ -124,7 +119,7 @@ static RT_API_ATTRS bool GetLowerCaseName( return false; } -static RT_API_ATTRS Fortran::common::optional GetSubscriptValue( +static Fortran::common::optional GetSubscriptValue( IoStatementState &io) { Fortran::common::optional value; std::size_t byteCount{0}; @@ -157,8 +152,8 @@ static RT_API_ATTRS Fortran::common::optional GetSubscriptValue( return value; } -static RT_API_ATTRS bool HandleSubscripts(IoStatementState &io, - Descriptor &desc, const Descriptor &source, const char *name) { +static bool HandleSubscripts(IoStatementState &io, Descriptor &desc, + const Descriptor &source, const char *name) { IoErrorHandler &handler{io.GetIoErrorHandler()}; // Allow for blanks in subscripts; they're nonstandard, but not // ambiguous within the parentheses. 
@@ -257,7 +252,7 @@ static RT_API_ATTRS bool HandleSubscripts(IoStatementState &io, return false; } -static RT_API_ATTRS void StorageSequenceExtension( +static void StorageSequenceExtension( Descriptor &desc, const Descriptor &source) { // Support the near-universal extension of NAMELIST input into a // designatable storage sequence identified by its initial scalar array @@ -279,7 +274,7 @@ static RT_API_ATTRS void StorageSequenceExtension( } } -static RT_API_ATTRS bool HandleSubstring( +static bool HandleSubstring( IoStatementState &io, Descriptor &desc, const char *name) { IoErrorHandler &handler{io.GetIoErrorHandler()}; auto pair{desc.type().GetCategoryAndKind()}; @@ -340,7 +335,7 @@ static RT_API_ATTRS bool HandleSubstring( return false; } -static RT_API_ATTRS bool HandleComponent(IoStatementState &io, Descriptor &desc, +static bool HandleComponent(IoStatementState &io, Descriptor &desc, const Descriptor &source, const char *name) { IoErrorHandler &handler{io.GetIoErrorHandler()}; char compName[nameBufferSize]; @@ -349,8 +344,7 @@ static RT_API_ATTRS bool HandleComponent(IoStatementState &io, Descriptor &desc, if (const typeInfo::DerivedType * type{addendum ? addendum->derivedType() : nullptr}) { if (const typeInfo::Component * - comp{type->FindDataComponent( - compName, Fortran::runtime::strlen(compName))}) { + comp{type->FindDataComponent(compName, std::strlen(compName))}) { bool createdDesc{false}; if (comp->rank() > 0 && source.rank() > 0) { // If base and component are both arrays, the component name @@ -414,7 +408,7 @@ static RT_API_ATTRS bool HandleComponent(IoStatementState &io, Descriptor &desc, // Advance to the terminal '/' of a namelist group or leading '&'/'$' // of the next. 
-static RT_API_ATTRS void SkipNamelistGroup(IoStatementState &io) { +static void SkipNamelistGroup(IoStatementState &io) { std::size_t byteCount{0}; while (auto ch{io.GetNextNonBlank(byteCount)}) { io.HandleRelativePosition(byteCount); @@ -437,7 +431,7 @@ static RT_API_ATTRS void SkipNamelistGroup(IoStatementState &io) { } } -bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) { +bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) { IoStatementState &io{*cookie}; io.CheckFormattedStmtType("InputNamelist"); io.mutableModes().inNamelist = true; @@ -476,7 +470,7 @@ bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) { handler.SignalError("NAMELIST input group has no name"); return false; } - if (Fortran::runtime::strcmp(group.groupName, name) == 0) { + if (std::strcmp(group.groupName, name) == 0) { break; // found it } SkipNamelistGroup(io); @@ -495,7 +489,7 @@ bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) { } std::size_t itemIndex{0}; for (; itemIndex < group.items; ++itemIndex) { - if (Fortran::runtime::strcmp(name, group.item[itemIndex].name) == 0) { + if (std::strcmp(name, group.item[itemIndex].name) == 0) { break; } } @@ -596,6 +590,8 @@ bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) { return true; } +RT_OFFLOAD_API_GROUP_BEGIN + bool IsNamelistNameOrSlash(IoStatementState &io) { if (auto *listInput{ io.get_if>()}) { -- cgit v1.2.3 From bbd259af0a4cc438dd02d5ee632cb2dc1def1f6a Mon Sep 17 00:00:00 2001 From: Yitzhak Mandelbaum Date: Thu, 4 Apr 2024 08:39:51 -0400 Subject: [clang][dataflow] Refactor `widen` API to be explicit about change effect. (#87233) The previous API relied on pointer equality of inputs and outputs to signal whether a change occurred. This was too subtle and led to bugs in practice. It was also very limiting: the override could not return an equivalent (but not identical) value. 
--- .../Analysis/FlowSensitive/DataflowEnvironment.h | 50 +++++++++------ .../clang/Analysis/FlowSensitive/DataflowLattice.h | 8 +-- .../Analysis/FlowSensitive/DataflowEnvironment.cpp | 49 +++++++-------- .../TypeErasedDataflowAnalysisTest.cpp | 72 +++++++++++++++------- 4 files changed, 111 insertions(+), 68 deletions(-) diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h index c30bccd06674..9a65f76cdf56 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h @@ -43,6 +43,15 @@ enum class ComparisonResult { Unknown, }; +/// The result of a `widen` operation. +struct WidenResult { + /// Non-null pointer to a potentially widened version of the input value. + Value *V; + /// Whether `V` represents a "change" (that is, a different value) with + /// respect to the previous value in the sequence. + LatticeEffect Effect; +}; + /// Holds the state of the program (store and heap) at a given program point. /// /// WARNING: Symbolic values that are created by the environment for static @@ -104,14 +113,17 @@ public: /// serve as a comparison operation, by indicating whether the widened value /// is equivalent to the previous value. /// - /// Returns either: - /// - /// `nullptr`, if this value is not of interest to the model, or - /// - /// `&Prev`, if the widened value is equivalent to `Prev`, or - /// - /// A non-null value that approximates `Current`. `Prev` is available to - /// inform the chosen approximation. + /// Returns one of the following: + /// * `std::nullopt`, if this value is not of interest to the + /// model. + /// * A `WidenResult` with: + /// * A non-null `Value *` that points either to `Current` or a widened + /// version of `Current`. This value must be consistent with + /// the flow condition of `CurrentEnv`. 
We particularly caution + /// against using `Prev`, which is rarely consistent. + /// * A `LatticeEffect` indicating whether the value should be + /// considered a new value (`Changed`) or one *equivalent* (if not + /// necessarily equal) to `Prev` (`Unchanged`). /// /// `PrevEnv` and `CurrentEnv` can be used to query child values and path /// condition implications of `Prev` and `Current`, respectively. @@ -122,17 +134,19 @@ public: /// /// `Prev` and `Current` must be assigned to the same storage location in /// `PrevEnv` and `CurrentEnv`, respectively. - virtual Value *widen(QualType Type, Value &Prev, const Environment &PrevEnv, - Value &Current, Environment &CurrentEnv) { + virtual std::optional widen(QualType Type, Value &Prev, + const Environment &PrevEnv, + Value &Current, + Environment &CurrentEnv) { // The default implementation reduces to just comparison, since comparison // is required by the API, even if no widening is performed. switch (compare(Type, Prev, PrevEnv, Current, CurrentEnv)) { - case ComparisonResult::Same: - return &Prev; - case ComparisonResult::Different: - return &Current; - case ComparisonResult::Unknown: - return nullptr; + case ComparisonResult::Unknown: + return std::nullopt; + case ComparisonResult::Same: + return WidenResult{&Current, LatticeEffect::Unchanged}; + case ComparisonResult::Different: + return WidenResult{&Current, LatticeEffect::Changed}; } llvm_unreachable("all cases in switch covered"); } @@ -236,8 +250,8 @@ public: /// /// `PrevEnv` must be the immediate previous version of the environment. /// `PrevEnv` and `this` must use the same `DataflowAnalysisContext`. - LatticeJoinEffect widen(const Environment &PrevEnv, - Environment::ValueModel &Model); + LatticeEffect widen(const Environment &PrevEnv, + Environment::ValueModel &Model); // FIXME: Rename `createOrGetStorageLocation` to `getOrCreateStorageLocation`, // `getStableStorageLocation`, or something more appropriate. 
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowLattice.h b/clang/include/clang/Analysis/FlowSensitive/DataflowLattice.h index 0c81e2f078c2..b262732804ed 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowLattice.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowLattice.h @@ -17,13 +17,13 @@ namespace clang { namespace dataflow { -/// Effect indicating whether a lattice join operation resulted in a new value. -// FIXME: Rename to `LatticeEffect` since `widen` uses it as well, and we are -// likely removing it from `join`. -enum class LatticeJoinEffect { +/// Effect indicating whether a lattice operation resulted in a new value. +enum class LatticeEffect { Unchanged, Changed, }; +// DEPRECATED. Use `LatticeEffect`. +using LatticeJoinEffect = LatticeEffect; } // namespace dataflow } // namespace clang diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp index f729d676dd0d..70ac0764476f 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp @@ -166,17 +166,16 @@ static Value *joinDistinctValues(QualType Type, Value &Val1, return JoinedVal; } -// When widening does not change `Current`, return value will equal `&Prev`. -static Value &widenDistinctValues(QualType Type, Value &Prev, - const Environment &PrevEnv, Value &Current, - Environment &CurrentEnv, - Environment::ValueModel &Model) { +static WidenResult widenDistinctValues(QualType Type, Value &Prev, + const Environment &PrevEnv, + Value &Current, Environment &CurrentEnv, + Environment::ValueModel &Model) { // Boolean-model widening. if (auto *PrevBool = dyn_cast(&Prev)) { - // If previous value was already Top, re-use that to (implicitly) indicate - // that no change occurred. if (isa(Prev)) - return Prev; + // Safe to return `Prev` here, because Top is never dependent on the + // environment. 
+ return {&Prev, LatticeEffect::Unchanged}; // We may need to widen to Top, but before we do so, check whether both // values are implied to be either true or false in the current environment. @@ -185,22 +184,24 @@ static Value &widenDistinctValues(QualType Type, Value &Prev, bool TruePrev = PrevEnv.proves(PrevBool->formula()); bool TrueCur = CurrentEnv.proves(CurBool.formula()); if (TruePrev && TrueCur) - return CurrentEnv.getBoolLiteralValue(true); + return {&CurrentEnv.getBoolLiteralValue(true), LatticeEffect::Unchanged}; if (!TruePrev && !TrueCur && PrevEnv.proves(PrevEnv.arena().makeNot(PrevBool->formula())) && CurrentEnv.proves(CurrentEnv.arena().makeNot(CurBool.formula()))) - return CurrentEnv.getBoolLiteralValue(false); + return {&CurrentEnv.getBoolLiteralValue(false), LatticeEffect::Unchanged}; - return CurrentEnv.makeTopBoolValue(); + return {&CurrentEnv.makeTopBoolValue(), LatticeEffect::Changed}; } // FIXME: Add other built-in model widening. // Custom-model widening. - if (auto *W = Model.widen(Type, Prev, PrevEnv, Current, CurrentEnv)) - return *W; + if (auto Result = Model.widen(Type, Prev, PrevEnv, Current, CurrentEnv)) + return *Result; - return equateUnknownValues(Prev.getKind()) ? Prev : Current; + return {&Current, equateUnknownValues(Prev.getKind()) + ? 
LatticeEffect::Unchanged + : LatticeEffect::Changed}; } // Returns whether the values in `Map1` and `Map2` compare equal for those @@ -271,7 +272,7 @@ llvm::MapVector widenKeyToValueMap(const llvm::MapVector &CurMap, const llvm::MapVector &PrevMap, Environment &CurEnv, const Environment &PrevEnv, - Environment::ValueModel &Model, LatticeJoinEffect &Effect) { + Environment::ValueModel &Model, LatticeEffect &Effect) { llvm::MapVector WidenedMap; for (auto &Entry : CurMap) { Key K = Entry.first; @@ -290,11 +291,11 @@ widenKeyToValueMap(const llvm::MapVector &CurMap, continue; } - Value &WidenedVal = widenDistinctValues(K->getType(), *PrevIt->second, - PrevEnv, *Val, CurEnv, Model); - WidenedMap.insert({K, &WidenedVal}); - if (&WidenedVal != PrevIt->second) - Effect = LatticeJoinEffect::Changed; + auto [WidenedVal, ValEffect] = widenDistinctValues( + K->getType(), *PrevIt->second, PrevEnv, *Val, CurEnv, Model); + WidenedMap.insert({K, WidenedVal}); + if (ValEffect == LatticeEffect::Changed) + Effect = LatticeEffect::Changed; } return WidenedMap; @@ -617,15 +618,15 @@ bool Environment::equivalentTo(const Environment &Other, return true; } -LatticeJoinEffect Environment::widen(const Environment &PrevEnv, - Environment::ValueModel &Model) { +LatticeEffect Environment::widen(const Environment &PrevEnv, + Environment::ValueModel &Model) { assert(DACtx == PrevEnv.DACtx); assert(ReturnVal == PrevEnv.ReturnVal); assert(ReturnLoc == PrevEnv.ReturnLoc); assert(ThisPointeeLoc == PrevEnv.ThisPointeeLoc); assert(CallStack == PrevEnv.CallStack); - auto Effect = LatticeJoinEffect::Unchanged; + auto Effect = LatticeEffect::Unchanged; // By the API, `PrevEnv` is a previous version of the environment for the same // block, so we have some guarantees about its shape. 
In particular, it will @@ -646,7 +647,7 @@ LatticeJoinEffect Environment::widen(const Environment &PrevEnv, ExprToLoc.size() != PrevEnv.ExprToLoc.size() || ExprToVal.size() != PrevEnv.ExprToVal.size() || LocToVal.size() != PrevEnv.LocToVal.size()) - Effect = LatticeJoinEffect::Changed; + Effect = LatticeEffect::Changed; return Effect; } diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp index bea00ab1a1f0..b0b579d2bc19 100644 --- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp @@ -805,6 +805,25 @@ public: else JoinedVal.setProperty("is_null", JoinedEnv.makeTopBoolValue()); } + + std::optional widen(QualType Type, Value &Prev, + const Environment &PrevEnv, Value &Current, + Environment &CurrentEnv) override { + switch (compare(Type, Prev, PrevEnv, Current, CurrentEnv)) { + case ComparisonResult::Same: + return WidenResult{&Current, LatticeJoinEffect::Unchanged}; + case ComparisonResult::Different: { + auto &CurPtr = cast(Current); + auto &WidenedPtr = + CurrentEnv.create(CurPtr.getPointeeLoc()); + WidenedPtr.setProperty("is_null", CurrentEnv.makeTopBoolValue()); + return WidenResult{&WidenedPtr, LatticeJoinEffect::Changed}; + } + case ComparisonResult::Unknown: + return std::nullopt; + } + llvm_unreachable("all cases in switch covered"); + } }; class WideningTest : public Test { @@ -846,7 +865,6 @@ TEST_F(WideningTest, JoinDistinctValuesWithDistinctProperties) { Code, [](const llvm::StringMap> &Results, ASTContext &ASTCtx) { - ASSERT_THAT(Results.keys(), UnorderedElementsAre("p1", "p2", "p3")); const Environment &Env1 = getEnvironmentAtAnnotation(Results, "p1"); const Environment &Env2 = getEnvironmentAtAnnotation(Results, "p2"); const Environment &Env3 = getEnvironmentAtAnnotation(Results, "p3"); @@ -889,8 +907,6 @@ TEST_F(WideningTest, 
JoinDistinctValuesWithSameProperties) { Code, [](const llvm::StringMap> &Results, ASTContext &ASTCtx) { - ASSERT_THAT(Results.keys(), - UnorderedElementsAre("p1", "p2", "p3", "p4")); const Environment &Env1 = getEnvironmentAtAnnotation(Results, "p1"); const Environment &Env2 = getEnvironmentAtAnnotation(Results, "p2"); const Environment &Env3 = getEnvironmentAtAnnotation(Results, "p3"); @@ -929,19 +945,11 @@ TEST_F(WideningTest, DistinctPointersToTheSameLocationAreEquivalent) { Code, [](const llvm::StringMap> &Results, ASTContext &ASTCtx) { - ASSERT_THAT(Results.keys(), UnorderedElementsAre("p")); const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); - - const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo"); - ASSERT_THAT(FooDecl, NotNull()); - - const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar"); - ASSERT_THAT(BarDecl, NotNull()); - - const auto *FooLoc = - cast(Env.getStorageLocation(*FooDecl)); - const auto *BarVal = cast(Env.getValue(*BarDecl)); - EXPECT_EQ(&BarVal->getPointeeLoc(), FooLoc); + const auto &FooLoc = + getLocForDecl(ASTCtx, Env, "Foo"); + const auto &BarVal = getValueForDecl(ASTCtx, Env, "Bar"); + EXPECT_EQ(&BarVal.getPointeeLoc(), &FooLoc); }); } @@ -963,18 +971,38 @@ TEST_F(WideningTest, DistinctValuesWithSamePropertiesAreEquivalent) { Code, [](const llvm::StringMap> &Results, ASTContext &ASTCtx) { - ASSERT_THAT(Results.keys(), UnorderedElementsAre("p")); const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); - - const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo"); - ASSERT_THAT(FooDecl, NotNull()); - - const auto *FooVal = Env.getValue(*FooDecl); - EXPECT_EQ(FooVal->getProperty("is_null"), + const auto &FooVal = getValueForDecl(ASTCtx, Env, "Foo"); + EXPECT_EQ(FooVal.getProperty("is_null"), &Env.getBoolLiteralValue(false)); }); } +TEST_F(WideningTest, DistinctValuesWithDifferentPropertiesWidenedToTop) { + std::string Code = R"( + void target(bool Cond) { + int *Foo; + int i = 0; + Foo = nullptr; + while 
(Cond) { + Foo = &i; + } + (void)0; + /*[[p]]*/ + } + )"; + runDataflow( + Code, + [](const llvm::StringMap> &Results, + ASTContext &ASTCtx) { + const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); + const auto &FooVal = getValueForDecl(ASTCtx, Env, "Foo"); + ASSERT_THAT(FooVal.getProperty("is_null"), NotNull()); + EXPECT_TRUE(areEquivalentValues(*FooVal.getProperty("is_null"), + Env.makeTopBoolValue())); + }); +} + class FlowConditionTest : public Test { protected: template -- cgit v1.2.3 From 7b5255297dca377a37c8df066e9d9749ab96cfad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Thu, 4 Apr 2024 13:40:08 +0100 Subject: [mlir][test] Make SME e2e tests require an emulator (#86489) Integration tests for ArmSME require an emulator (there's no hardware available). Make sure that CMake complains if `MLIR_RUN_ARM_SME_TESTS` is set while `ARM_EMULATOR_EXECUTABLE` is empty. I'm also adding a note in the docs for future reference. --- mlir/cmake/modules/MLIRCheckHardwareFeatures.cmake | 101 +++++++++++++++++++++ mlir/docs/Dialects/ArmSME.md | 10 +- mlir/test/CMakeLists.txt | 6 ++ 3 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 mlir/cmake/modules/MLIRCheckHardwareFeatures.cmake diff --git a/mlir/cmake/modules/MLIRCheckHardwareFeatures.cmake b/mlir/cmake/modules/MLIRCheckHardwareFeatures.cmake new file mode 100644 index 000000000000..fff0424593fb --- /dev/null +++ b/mlir/cmake/modules/MLIRCheckHardwareFeatures.cmake @@ -0,0 +1,101 @@ +# A collection of helper CMake functions to detect hardware capabilities. At +# the moment these are used when configuring MLIR integration tests. + +# Checks whether the specified hardware capability is supported by the host +# Linux system. This is implemented by checking auxiliary vector feature +# provided by the Linux kernel. +# +# check_hwcap( +# hwcap_spec +# output_var +# ) +# +# hwcap_spec - HWCAP value to check - these are defined in hwcap.h in the Linux +# kernel. 
+# +# output_var - Output variable to use to save the results (TRUE for supported, +# FALSE for not supported). +# +# EXAMPLES: +# +# check_hwcap("HWCAP2_SME" SME_EMULATOR_REQUIRED) +# +function(check_hwcap hwcap_spec output) + set(hwcap_test_src + [====[ + #include + #include + + int main(void) + { + long hwcaps = getauxval(AT_); + return (hwcaps & ) != 0; + } + ]====] + ) + + # Extract from $hwcap_spec whether this is AT_HWCAP or AT_HWCAP2 + string(FIND ${hwcap_spec} "_" wsloc) + string(SUBSTRING ${hwcap_spec} 0 ${wsloc} hwcap_vec) + + string(REPLACE "" ${hwcap_vec} hwcap_test_src "${hwcap_test_src}") + string(REPLACE "" ${hwcap_spec} hwcap_test_src "${hwcap_test_src}") + + set(hwcap_test_file ${CMAKE_BINARY_DIR}/${CMAKE_FILES_DIRECTORY}/hwcap_check.c) + file(WRITE ${hwcap_test_file} "${hwcap_test_src}") + + # Compile _and_ run + try_run( + test_run_result test_compile_result + ${CMAKE_BINARY_DIR} + ${hwcap_test_file} + ) + # Compilation will fail if hwcap_spec is not defined - this usually means + # that your Linux kernel is too old. + if(${test_compile_result} AND (DEFINED test_run_result)) + message(${test_run_result}) + message(STATUS "Checking whether ${hwcap_spec} is supported by the host system: ${test_run_result}") + set(${output} ${test_run_result} PARENT_SCOPE) + else() + message(STATUS "Checking whether ${hwcap_spec} is supported by the host system: FALSE") + endif() +endfunction(check_hwcap) + +# For the given group of e2e tests (defined by the `mlir_e2e_tests` flag), +# checks whether an emulator is required. If yes, verifies that the +# corresponding CMake var pointing to an emulator (`emulator_exec`) has been +# set. +# +# check_emulator( +# mlir_e2e_tests +# hwcap_spec +# emulator_exec +# ) +# +# mlir_e2e_tests - MLIR CMake variables corresponding to the group of e2e tests +# to check +# hwcap_spec - HWCAP value to check. This should correspond to the hardware +# capabilities required by the tests to be checked. 
Possible +# values are defined in hwcap.h in the Linux kernel. +# emulator_exec - variable the defines the emulator (ought to be set if +# required, can be empty otherwise). +# +# EXAMPLES: +# +# check_emulator(MLIR_RUN_ARM_SVE_TESTS "HWCAP_SVE" ARM_EMULATOR_EXECUTABLE) +# +function(check_emulator mlir_e2e_tests hwcap_spec emulator_exec) + if (NOT ${mlir_e2e_tests}) + return() + endif() + + check_hwcap(${hwcap_spec} emulator_not_required) + if (${emulator_not_required}) + return() + endif() + + if (${emulator_exec} STREQUAL "") + message(FATAL_ERROR "${mlir_e2e_tests} requires an emulator, but ${emulator_exec} is not set") + endif() + +endfunction() diff --git a/mlir/docs/Dialects/ArmSME.md b/mlir/docs/Dialects/ArmSME.md index 7326150bcd11..ce0a76ed60f1 100644 --- a/mlir/docs/Dialects/ArmSME.md +++ b/mlir/docs/Dialects/ArmSME.md @@ -6,7 +6,7 @@ This dialect defines custom and LLVM IR intrinsic operations that are used to target Arm Scalable Matrix Extension. Through the available conversion and ArmSME passes you can, for example, lower a [linalg.matmul](https://mlir.llvm.org/docs/Dialects/Linalg/#linalgmatmul-linalgmatmulop) -opereation to Arm SME +operation to Arm SME [FMOPA](https://developer.arm.com/documentation/ddi0602/2023-03/SME-Instructions/FMOPA--widening---Half-precision-floating-point-sum-of-outer-products-and-accumulate-) (floating-point outer product) operations. 
See one of the in-tree end-to-end integration tests for reference: @@ -14,6 +14,14 @@ integration tests for reference: * [Linalg/CPU/ArmSME/matmul.mlir](https://github.com/llvm/llvm-project/blob/main/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir) * [Vector/CPU/ArmSME/test-outerproduct-f64.mlir](https://github.com/llvm/llvm-project/blob/main/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir) +In order to run ArmSME integration tests, include these flags in the CMake +invocation when configuring LLVM and MLIR: +```bash + -DMLIR_INCLUDE_INTEGRATION_TESTS=On + -DMLIR_RUN_ARM_SME_TESTS=On + -DARM_EMULATOR_EXECUTABLE= +``` + These tests are run "post-commit" by the [clang-aarch64-sve-vla](https://lab.llvm.org/buildbot/#/builders/197) LLVM BuildBot worker. diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index baf07ea1f010..5319a9cb33e0 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -1,3 +1,5 @@ +include(MLIRCheckHardwareFeatures) + add_subdirectory(CAPI) add_subdirectory(lib) @@ -39,6 +41,10 @@ if (MLIR_INCLUDE_INTEGRATION_TESTS) option(MLIR_RUN_ARM_SVE_TESTS "Run Arm SVE tests.") option(MLIR_RUN_ARM_SME_TESTS "Run Arm SME tests.") + # Check whether an emulator is required - if yes then make sure that it's + # been set. + check_emulator(MLIR_RUN_ARM_SVE_TESTS "HWCAP_SVE" ARM_EMULATOR_EXECUTABLE) + check_emulator(MLIR_RUN_ARM_SME_TESTS "HWCAP2_SME" ARM_EMULATOR_EXECUTABLE) # The native target may not be enabled when cross compiling, raise an error. if(NOT MLIR_ENABLE_EXECUTION_ENGINE) -- cgit v1.2.3 From 8f9903db8aed496a2cdc75a614e6d2c65c1acc07 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Thu, 4 Apr 2024 14:41:50 +0200 Subject: [LLD][COFF][NFC] Use getMachineArchType helper. (#87495) It's similar to #87370, but for lld-link. 
--- lld/COFF/Chunks.cpp | 28 ++++++++++++---------------- lld/COFF/Chunks.h | 6 ++++++ lld/COFF/Driver.cpp | 1 - lld/COFF/SymbolTable.cpp | 1 - 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index 39f4575031be..004d71097387 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -437,19 +437,17 @@ void SectionChunk::applyRelocation(uint8_t *off, // Compute the RVA of the relocation for relative relocations. uint64_t p = rva + rel.VirtualAddress; uint64_t imageBase = file->ctx.config.imageBase; - switch (getMachine()) { - case AMD64: + switch (getArch()) { + case Triple::x86_64: applyRelX64(off, rel.Type, os, s, p, imageBase); break; - case I386: + case Triple::x86: applyRelX86(off, rel.Type, os, s, p, imageBase); break; - case ARMNT: + case Triple::thumb: applyRelARM(off, rel.Type, os, s, p, imageBase); break; - case ARM64: - case ARM64EC: - case ARM64X: + case Triple::aarch64: applyRelARM64(off, rel.Type, os, s, p, imageBase); break; default: @@ -516,27 +514,25 @@ void SectionChunk::addAssociative(SectionChunk *child) { } static uint8_t getBaserelType(const coff_relocation &rel, - llvm::COFF::MachineTypes machine) { - switch (machine) { - case AMD64: + Triple::ArchType arch) { + switch (arch) { + case Triple::x86_64: if (rel.Type == IMAGE_REL_AMD64_ADDR64) return IMAGE_REL_BASED_DIR64; if (rel.Type == IMAGE_REL_AMD64_ADDR32) return IMAGE_REL_BASED_HIGHLOW; return IMAGE_REL_BASED_ABSOLUTE; - case I386: + case Triple::x86: if (rel.Type == IMAGE_REL_I386_DIR32) return IMAGE_REL_BASED_HIGHLOW; return IMAGE_REL_BASED_ABSOLUTE; - case ARMNT: + case Triple::thumb: if (rel.Type == IMAGE_REL_ARM_ADDR32) return IMAGE_REL_BASED_HIGHLOW; if (rel.Type == IMAGE_REL_ARM_MOV32T) return IMAGE_REL_BASED_ARM_MOV32T; return IMAGE_REL_BASED_ABSOLUTE; - case ARM64: - case ARM64EC: - case ARM64X: + case Triple::aarch64: if (rel.Type == IMAGE_REL_ARM64_ADDR64) return IMAGE_REL_BASED_DIR64; return IMAGE_REL_BASED_ABSOLUTE; 
@@ -551,7 +547,7 @@ static uint8_t getBaserelType(const coff_relocation &rel, // Only called when base relocation is enabled. void SectionChunk::getBaserels(std::vector *res) { for (const coff_relocation &rel : getRelocs()) { - uint8_t ty = getBaserelType(rel, getMachine()); + uint8_t ty = getBaserelType(rel, getArch()); if (ty == IMAGE_REL_BASED_ABSOLUTE) continue; Symbol *target = file->getSymbol(rel.SymbolTableIndex); diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h index 7b6bdeae4234..bb919037ecc2 100644 --- a/lld/COFF/Chunks.h +++ b/lld/COFF/Chunks.h @@ -18,6 +18,7 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/MC/StringTableBuilder.h" #include "llvm/Object/COFF.h" +#include "llvm/Object/WindowsMachineFlag.h" #include #include @@ -116,6 +117,7 @@ public: bool isHotPatchable() const; MachineTypes getMachine() const; + llvm::Triple::ArchType getArch() const; std::optional getArm64ECRangeType() const; protected: @@ -437,6 +439,10 @@ inline MachineTypes Chunk::getMachine() const { return static_cast(this)->getMachine(); } +inline llvm::Triple::ArchType Chunk::getArch() const { + return llvm::getMachineArchType(getMachine()); +} + inline std::optional Chunk::getArm64ECRangeType() const { // Data sections don't need codemap entries. 
if (!(getOutputCharacteristics() & llvm::COFF::IMAGE_SCN_MEM_EXECUTE)) diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index ea37f8deecf0..b0365b5b9441 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -31,7 +31,6 @@ #include "llvm/Object/ArchiveWriter.h" #include "llvm/Object/COFFImportFile.h" #include "llvm/Object/COFFModuleDefinition.h" -#include "llvm/Object/WindowsMachineFlag.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Option/Option.h" diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index 44aa506d2c35..3accf24663c6 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -19,7 +19,6 @@ #include "llvm/DebugInfo/DIContext.h" #include "llvm/IR/LLVMContext.h" #include "llvm/LTO/LTO.h" -#include "llvm/Object/WindowsMachineFlag.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include -- cgit v1.2.3 From e0e615efac522365591119165a7691ce869de512 Mon Sep 17 00:00:00 2001 From: Christian Ulmann Date: Thu, 4 Apr 2024 15:07:18 +0200 Subject: Revert "[MLIR][LLVM][Mem2Reg] Relax type equality requirement for load and store (#87504)" (#87631) This reverts commit d6e458219897fad0e460e663833b2190af48c06d as it violates an assumption of Mem2Reg's block argument creation. Mem2Reg strongly assumes that all involved values have the same type as the alloca, which was relaxed by this PR. Therefore, branches got created that jumped to basic blocks with differently typed block arguments. 
--- mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp | 38 +---- mlir/test/Dialect/LLVMIR/mem2reg.mlir | 197 -------------------------- 2 files changed, 4 insertions(+), 231 deletions(-) diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp index f8fd3bdaff4a..06c1fdd2eb2d 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp @@ -126,13 +126,6 @@ Value LLVM::StoreOp::getStored(const MemorySlot &slot, RewriterBase &rewriter) { return getValue(); } -/// Checks that two types are the same or can be cast into one another. -static bool areCastCompatible(const DataLayout &layout, Type lhs, Type rhs) { - return lhs == rhs || (!isa(lhs) && - !isa(rhs) && - layout.getTypeSize(lhs) == layout.getTypeSize(rhs)); -} - bool LLVM::LoadOp::canUsesBeRemoved( const MemorySlot &slot, const SmallPtrSetImpl &blockingUses, SmallVectorImpl &newBlockingUses, @@ -145,29 +138,7 @@ bool LLVM::LoadOp::canUsesBeRemoved( // be removed (provided it loads the exact stored value and is not // volatile). return blockingUse == slot.ptr && getAddr() == slot.ptr && - areCastCompatible(dataLayout, getResult().getType(), slot.elemType) && - !getVolatile_(); -} - -/// Constructs operations that convert `inputValue` into a new value of type -/// `targetType`. Assumes that this conversion is possible. 
-static Value createConversionSequence(RewriterBase &rewriter, Location loc, - Value inputValue, Type targetType) { - if (inputValue.getType() == targetType) - return inputValue; - - if (!isa(targetType) && - !isa(inputValue.getType())) - return rewriter.createOrFold(loc, targetType, inputValue); - - if (!isa(targetType)) - return rewriter.createOrFold(loc, targetType, inputValue); - - if (!isa(inputValue.getType())) - return rewriter.createOrFold(loc, targetType, inputValue); - - return rewriter.createOrFold(loc, targetType, - inputValue); + getResult().getType() == slot.elemType && !getVolatile_(); } DeletionKind LLVM::LoadOp::removeBlockingUses( @@ -175,9 +146,7 @@ DeletionKind LLVM::LoadOp::removeBlockingUses( RewriterBase &rewriter, Value reachingDefinition) { // `canUsesBeRemoved` checked this blocking use must be the loaded slot // pointer. - Value newResult = createConversionSequence( - rewriter, getLoc(), reachingDefinition, getResult().getType()); - rewriter.replaceAllUsesWith(getResult(), newResult); + rewriter.replaceAllUsesWith(getResult(), reachingDefinition); return DeletionKind::Delete; } @@ -192,7 +161,8 @@ bool LLVM::StoreOp::canUsesBeRemoved( // fine, provided we are currently promoting its target value. Don't allow a // store OF the slot pointer, only INTO the slot pointer. 
return blockingUse == slot.ptr && getAddr() == slot.ptr && - getValue() != slot.ptr && !getVolatile_(); + getValue() != slot.ptr && getValue().getType() == slot.elemType && + !getVolatile_(); } DeletionKind LLVM::StoreOp::removeBlockingUses( diff --git a/mlir/test/Dialect/LLVMIR/mem2reg.mlir b/mlir/test/Dialect/LLVMIR/mem2reg.mlir index d6d5e1bdc93c..90e56c1166ed 100644 --- a/mlir/test/Dialect/LLVMIR/mem2reg.mlir +++ b/mlir/test/Dialect/LLVMIR/mem2reg.mlir @@ -697,200 +697,3 @@ llvm.func @transitive_reaching_def() -> !llvm.ptr { %3 = llvm.load %1 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr llvm.return %3 : !llvm.ptr } - -// ----- - -// CHECK-LABEL: @load_int_from_float -llvm.func @load_int_from_float() -> i32 { - %0 = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NOT: llvm.alloca - %1 = llvm.alloca %0 x f32 {alignment = 4 : i64} : (i32) -> !llvm.ptr - %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i32 - // CHECK: %[[UNDEF:.*]] = llvm.mlir.undef - // CHECK: %[[BITCAST:.*]] = llvm.bitcast %[[UNDEF]] : f32 to i32 - // CHECK: llvm.return %[[BITCAST:.*]] - llvm.return %2 : i32 -} - -// ----- - -// CHECK-LABEL: @load_float_from_int -llvm.func @load_float_from_int() -> f32 { - %0 = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NOT: llvm.alloca - %1 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr - %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> f32 - // CHECK: %[[UNDEF:.*]] = llvm.mlir.undef - // CHECK: %[[BITCAST:.*]] = llvm.bitcast %[[UNDEF]] : i32 to f32 - // CHECK: llvm.return %[[BITCAST:.*]] - llvm.return %2 : f32 -} - -// ----- - -// CHECK-LABEL: @load_int_from_vector -llvm.func @load_int_from_vector() -> i32 { - %0 = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NOT: llvm.alloca - %1 = llvm.alloca %0 x vector<2xi16> : (i32) -> !llvm.ptr - %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i32 - // CHECK: %[[UNDEF:.*]] = llvm.mlir.undef - // CHECK: %[[BITCAST:.*]] = llvm.bitcast %[[UNDEF]] : vector<2xi16> to i32 - // CHECK: 
llvm.return %[[BITCAST:.*]] - llvm.return %2 : i32 -} - -// ----- - -// LLVM arrays cannot be bitcasted, so the following cannot be promoted. - -// CHECK-LABEL: @load_int_from_array -llvm.func @load_int_from_array() -> i32 { - %0 = llvm.mlir.constant(1 : i32) : i32 - // CHECK: llvm.alloca - %1 = llvm.alloca %0 x !llvm.array<2 x i16> : (i32) -> !llvm.ptr - %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i32 - // CHECK-NOT: llvm.bitcast - llvm.return %2 : i32 -} - -// ----- - -// CHECK-LABEL: @store_int_to_float -// CHECK-SAME: %[[ARG:.*]]: i32 -llvm.func @store_int_to_float(%arg: i32) -> i32 { - %0 = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NOT: llvm.alloca - %1 = llvm.alloca %0 x f32 {alignment = 4 : i64} : (i32) -> !llvm.ptr - llvm.store %arg, %1 {alignment = 4 : i64} : i32, !llvm.ptr - %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i32 - // CHECK: llvm.return %[[ARG]] - llvm.return %2 : i32 -} - -// ----- - -// CHECK-LABEL: @store_float_to_int -// CHECK-SAME: %[[ARG:.*]]: f32 -llvm.func @store_float_to_int(%arg: f32) -> i32 { - %0 = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NOT: llvm.alloca - %1 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr - llvm.store %arg, %1 {alignment = 4 : i64} : f32, !llvm.ptr - %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i32 - // CHECK: %[[BITCAST:.*]] = llvm.bitcast %[[ARG]] : f32 to i32 - // CHECK: llvm.return %[[BITCAST]] - llvm.return %2 : i32 -} - -// ----- - -// CHECK-LABEL: @store_int_to_vector -// CHECK-SAME: %[[ARG:.*]]: i32 -llvm.func @store_int_to_vector(%arg: i32) -> vector<4xi8> { - %0 = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NOT: llvm.alloca - %1 = llvm.alloca %0 x vector<2xi16> {alignment = 4 : i64} : (i32) -> !llvm.ptr - llvm.store %arg, %1 {alignment = 4 : i64} : i32, !llvm.ptr - %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> vector<4xi8> - // CHECK: %[[BITCAST:.*]] = llvm.bitcast %[[ARG]] : i32 to vector<4xi8> - // CHECK: llvm.return %[[BITCAST]] - 
llvm.return %2 : vector<4xi8> -} - -// ----- - -// CHECK-LABEL: @load_ptr_from_int -llvm.func @load_ptr_from_int() -> !llvm.ptr { - %0 = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NOT: llvm.alloca - %1 = llvm.alloca %0 x i64 {alignment = 4 : i64} : (i32) -> !llvm.ptr - %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> !llvm.ptr - // CHECK: %[[UNDEF:.*]] = llvm.mlir.undef - // CHECK: %[[CAST:.*]] = llvm.inttoptr %[[UNDEF]] : i64 to !llvm.ptr - // CHECK: llvm.return %[[CAST:.*]] - llvm.return %2 : !llvm.ptr -} - -// ----- - -// CHECK-LABEL: @load_int_from_ptr -llvm.func @load_int_from_ptr() -> i64 { - %0 = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NOT: llvm.alloca - %1 = llvm.alloca %0 x !llvm.ptr {alignment = 4 : i64} : (i32) -> !llvm.ptr - %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i64 - // CHECK: %[[UNDEF:.*]] = llvm.mlir.undef - // CHECK: %[[CAST:.*]] = llvm.ptrtoint %[[UNDEF]] : !llvm.ptr to i64 - // CHECK: llvm.return %[[CAST:.*]] - llvm.return %2 : i64 -} - -// ----- - -// CHECK-LABEL: @load_ptr_addrspace_cast -llvm.func @load_ptr_addrspace_cast() -> !llvm.ptr<2> { - %0 = llvm.mlir.constant(1 : i32) : i32 - // CHECK-NOT: llvm.alloca - %1 = llvm.alloca %0 x !llvm.ptr<1> {alignment = 4 : i64} : (i32) -> !llvm.ptr - %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> !llvm.ptr<2> - // CHECK: %[[UNDEF:.*]] = llvm.mlir.undef - // CHECK: %[[CAST:.*]] = llvm.addrspacecast %[[UNDEF]] : !llvm.ptr<1> to !llvm.ptr<2> - // CHECK: llvm.return %[[CAST:.*]] - llvm.return %2 : !llvm.ptr<2> -} - -// ----- - -// CHECK-LABEL: @load_smaller_int -llvm.func @load_smaller_int() -> i16 { - %0 = llvm.mlir.constant(1 : i32) : i32 - // CHECK: llvm.alloca - %1 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr - %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i16 - llvm.return %2 : i16 -} - -// ----- - -// CHECK-LABEL: @load_different_type_smaller -llvm.func @load_different_type_smaller() -> f32 { - %0 = llvm.mlir.constant(1 : i32) : i32 - 
// CHECK: llvm.alloca - %1 = llvm.alloca %0 x i64 {alignment = 8 : i64} : (i32) -> !llvm.ptr - %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> f32 - llvm.return %2 : f32 -} - -// ----- - -// This alloca is too small for the load, still, mem2reg should not touch it. - -// CHECK-LABEL: @impossible_load -llvm.func @impossible_load() -> f64 { - %0 = llvm.mlir.constant(1 : i32) : i32 - // CHECK: llvm.alloca - %1 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr - %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> f64 - llvm.return %2 : f64 -} - -// ----- - -// Verifies that mem2reg does not introduce address space casts of pointers -// with different bitsize. - -module attributes { dlti.dl_spec = #dlti.dl_spec< - #dlti.dl_entry, dense<[32, 64, 64]> : vector<3xi64>>, - #dlti.dl_entry, dense<[64, 64, 64]> : vector<3xi64>> ->} { - - // CHECK-LABEL: @load_ptr_addrspace_cast_different_size - llvm.func @load_ptr_addrspace_cast_different_size() -> !llvm.ptr<2> { - %0 = llvm.mlir.constant(1 : i32) : i32 - // CHECK: llvm.alloca - %1 = llvm.alloca %0 x !llvm.ptr<1> {alignment = 4 : i64} : (i32) -> !llvm.ptr - %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> !llvm.ptr<2> - llvm.return %2 : !llvm.ptr<2> - } -} -- cgit v1.2.3 From 417a068b63c01d79511fe3732dd52377b05d06fc Mon Sep 17 00:00:00 2001 From: Mikael Holmen Date: Thu, 4 Apr 2024 14:57:10 +0200 Subject: [clang][CGBlocks] Remove unused variable "refType" [NFC] Without the change gcc warned like ../../clang/lib/CodeGen/CGBlocks.cpp:965:21: warning: unused variable 'refType' [-Wunused-variable] 965 | } else if (auto refType = type->getAs()) { | ^~~~~~~ --- clang/lib/CodeGen/CGBlocks.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp index a01f2c7c9798..47f063b5501c 100644 --- a/clang/lib/CodeGen/CGBlocks.cpp +++ b/clang/lib/CodeGen/CGBlocks.cpp @@ -962,7 +962,7 @@ llvm::Value 
*CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) { } // If it's a reference variable, copy the reference into the block field. - } else if (auto refType = type->getAs()) { + } else if (type->getAs()) { Builder.CreateStore(src.emitRawPointer(*this), blockField); // If type is const-qualified, copy the value into the block field. -- cgit v1.2.3 From a1f4ac7704255627ac33ad67a22be5ac030f6179 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Thu, 4 Apr 2024 21:12:25 +0800 Subject: [SEH] Ignore EH pad check for internal intrinsics (#79694) Intrinsics like @llvm.seh.scope.begin and @llvm.seh.scope.end which do not throw do not need funclets in catchpads or cleanuppads. Fixes #69428 Co-authored-by: Robert Cox --------- Co-authored-by: Robert Cox --- llvm/lib/IR/Verifier.cpp | 5 +++++ llvm/test/Verifier/pr69428.ll | 48 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 llvm/test/Verifier/pr69428.ll diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 33f358440a31..ba0b72312702 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -4343,6 +4343,11 @@ void Verifier::visitEHPadPredecessors(Instruction &I) { if (auto *II = dyn_cast(TI)) { Check(II->getUnwindDest() == BB && II->getNormalDest() != BB, "EH pad must be jumped to via an unwind edge", ToPad, II); + auto *CalledFn = + dyn_cast(II->getCalledOperand()->stripPointerCasts()); + if (CalledFn && CalledFn->isIntrinsic() && II->doesNotThrow() && + !IntrinsicInst::mayLowerToFunctionCall(CalledFn->getIntrinsicID())) + continue; if (auto Bundle = II->getOperandBundle(LLVMContext::OB_funclet)) FromPad = Bundle->Inputs[0]; else diff --git a/llvm/test/Verifier/pr69428.ll b/llvm/test/Verifier/pr69428.ll new file mode 100644 index 000000000000..be8733bea1ab --- /dev/null +++ b/llvm/test/Verifier/pr69428.ll @@ -0,0 +1,48 @@ +; RUN: llvm-as -disable-output %s + +%struct._List_node_emplace_op2 = type { i8 } + +@"?_List@@3HA" = global i32 0, 
align 4 + +define void @"?ExecutionEngineaddExecutableDependency@@YAXXZ"() personality ptr @__CxxFrameHandler3 { +entry: + %agg.tmp.ensured.i = alloca %struct._List_node_emplace_op2, align 1 + %0 = load i32, ptr @"?_List@@3HA", align 4 + %call.i = call noundef ptr @"??0?$_List_node_emplace_op2@H@@QEAA@H@Z"(ptr %agg.tmp.ensured.i, i32 %0) + invoke void @llvm.seh.scope.begin() + to label %invoke.cont.i unwind label %ehcleanup.i + +invoke.cont.i: ; preds = %entry + invoke void @llvm.seh.scope.end() + to label %invoke.cont2.i unwind label %ehcleanup.i + +invoke.cont2.i: ; preds = %invoke.cont.i + call void @"??1?$_List_node_emplace_op2@H@@QEAA@XZ"(ptr %agg.tmp.ensured.i) #6 + unreachable + +ehcleanup.i: ; preds = %invoke.cont.i, %entry + %1 = cleanuppad within none [] + invoke void @llvm.seh.scope.begin() + to label %invoke.cont.i.i unwind label %ehcleanup.i.i + +invoke.cont.i.i: ; preds = %ehcleanup.i + invoke void @llvm.seh.scope.end() + to label %"??1?$_List_node_emplace_op2@H@@QEAA@XZ.exit.i" unwind label %ehcleanup.i.i + +ehcleanup.i.i: ; preds = %invoke.cont.i.i, %ehcleanup.i + %2 = cleanuppad within %1 [] + call void @"??1_Alloc_construct_ptr@@QEAA@XZ"(ptr %agg.tmp.ensured.i) #6 [ "funclet"(token %2) ] + cleanupret from %2 unwind to caller + +"??1?$_List_node_emplace_op2@H@@QEAA@XZ.exit.i": ; preds = %invoke.cont.i.i + call void @"??1_Alloc_construct_ptr@@QEAA@XZ"(ptr %agg.tmp.ensured.i) #6 [ "funclet"(token %1) ] + cleanupret from %1 unwind to caller +} + +declare i32 @__CxxFrameHandler3(...) 
+declare void @llvm.seh.scope.begin() +declare void @llvm.seh.scope.end() + +declare void @"??1?$_List_node_emplace_op2@H@@QEAA@XZ"(ptr) +declare void @"??1_Alloc_construct_ptr@@QEAA@XZ"(ptr) +declare ptr @"??0?$_List_node_emplace_op2@H@@QEAA@H@Z"(ptr, i32) -- cgit v1.2.3 From ea88bb16485a34db58333637d8558ff57e6a1329 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Thu, 4 Apr 2024 15:31:54 +0200 Subject: [flang] Return 1 in ERROR STOP without user provided stop-code (#87501) See F'2023 section 11.4: "If the stop-code in an ERROR STOP statement is of type character or does not appear, it is recommended that a processor-dependent nonzero value be supplied as the process exit status" Fixes https://github.com/llvm/llvm-project/issues/66581. --- flang/lib/Lower/Runtime.cpp | 9 +++++---- flang/test/Lower/stop-statement.f90 | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/flang/lib/Lower/Runtime.cpp b/flang/lib/Lower/Runtime.cpp index e7695929623f..3474832bdb22 100644 --- a/flang/lib/Lower/Runtime.cpp +++ b/flang/lib/Lower/Runtime.cpp @@ -55,6 +55,8 @@ static void genUnreachable(fir::FirOpBuilder &builder, mlir::Location loc) { void Fortran::lower::genStopStatement( Fortran::lower::AbstractConverter &converter, const Fortran::parser::StopStmt &stmt) { + const bool isError = std::get(stmt.t) == + Fortran::parser::StopStmt::Kind::ErrorStop; fir::FirOpBuilder &builder = converter.getFirOpBuilder(); mlir::Location loc = converter.getCurrentLocation(); Fortran::lower::StatementContext stmtCtx; @@ -94,13 +96,12 @@ void Fortran::lower::genStopStatement( } else { callee = fir::runtime::getRuntimeFunc(loc, builder); calleeType = callee.getFunctionType(); - operands.push_back( - builder.createIntegerConstant(loc, calleeType.getInput(0), 0)); + // Default to values are advised in F'2023 11.4 p2. + operands.push_back(builder.createIntegerConstant( + loc, calleeType.getInput(0), isError ? 
1 : 0)); } // Second operand indicates ERROR STOP - bool isError = std::get(stmt.t) == - Fortran::parser::StopStmt::Kind::ErrorStop; operands.push_back(builder.createIntegerConstant( loc, calleeType.getInput(operands.size()), isError)); diff --git a/flang/test/Lower/stop-statement.f90 b/flang/test/Lower/stop-statement.f90 index bc94a7ee23a6..cf0665cf5dbd 100644 --- a/flang/test/Lower/stop-statement.f90 +++ b/flang/test/Lower/stop-statement.f90 @@ -21,10 +21,10 @@ end subroutine ! CHECK-LABEL: stop_error subroutine stop_error() error stop - ! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : i32 + ! CHECK-DAG: %[[c_1:.*]] = arith.constant 1 : i32 ! CHECK-DAG: %[[true:.*]] = arith.constant true ! CHECK-DAG: %[[false:.*]] = arith.constant false - ! CHECK: fir.call @_Fortran{{.*}}StopStatement(%[[c0]], %[[true]], %[[false]]) + ! CHECK: fir.call @_Fortran{{.*}}StopStatement(%[[c_1]], %[[true]], %[[false]]) ! CHECK-NEXT: fir.unreachable end subroutine -- cgit v1.2.3 From 1bce411073a1deafef998d0fe9e3ae74c4cef1e5 Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Thu, 4 Apr 2024 21:51:25 +0800 Subject: MIPS/Clang: Set HasUnalignedAccess false if +strict-align (#87257) TargetInfo has HasUnalignedAccess support now. For MIPSr6, we should set it according strict-align. For pre-R6, we always set strict-align and HasUnalignedAccess to false. --- clang/lib/Basic/Targets/Mips.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/clang/lib/Basic/Targets/Mips.h b/clang/lib/Basic/Targets/Mips.h index c9dcf434c93b..0d6e4b4d0808 100644 --- a/clang/lib/Basic/Targets/Mips.h +++ b/clang/lib/Basic/Targets/Mips.h @@ -318,6 +318,7 @@ public: FPMode = isFP64Default() ? 
FP64 : FPXX; NoOddSpreg = false; bool OddSpregGiven = false; + bool StrictAlign = false; for (const auto &Feature : Features) { if (Feature == "+single-float") @@ -330,6 +331,10 @@ public: IsMicromips = true; else if (Feature == "+mips32r6" || Feature == "+mips64r6") HasUnalignedAccess = true; + // We cannot be sure that the order of strict-align vs mips32r6. + // Thus we need an extra variable here. + else if (Feature == "+strict-align") + StrictAlign = true; else if (Feature == "+dsp") DspRev = std::max(DspRev, DSP1); else if (Feature == "+dspr2") @@ -368,6 +373,9 @@ public: if (FPMode == FPXX && !OddSpregGiven) NoOddSpreg = true; + if (StrictAlign) + HasUnalignedAccess = false; + setDataLayout(); return true; -- cgit v1.2.3 From 110e933b7ae9150710a48b586fd3da39439079c2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Apr 2024 14:51:23 +0100 Subject: CGOpenMPRuntime.cpp - fix Wparentheses warning. NFC. --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index bc363313dec6..8eb10584699f 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -2648,9 +2648,9 @@ void CGOpenMPRuntime::emitDistributeStaticInit( void CGOpenMPRuntime::emitForStaticFinish(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind DKind) { - assert(DKind == OMPD_distribute || DKind == OMPD_for || - DKind == OMPD_sections && - "Expected distribute, for, or sections directive kind"); + assert((DKind == OMPD_distribute || DKind == OMPD_for || + DKind == OMPD_sections) && + "Expected distribute, for, or sections directive kind"); if (!CGF.HaveInsertPoint()) return; // Call __kmpc_for_static_fini(ident_t *loc, kmp_int32 tid); -- cgit v1.2.3 From 24c256a6b7892bcf98eac531c99d9038cd826ce0 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Apr 2024 14:56:31 +0100 Subject: 
AMDGPULowerBufferFatPointers.cpp - fix Wparentheses warning. NFC. --- llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 9083150b3384..20ca633ddee8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -1777,8 +1777,8 @@ void SplitPtrStructs::processFunction(Function &F) { Originals.push_back(&I); for (Instruction *I : Originals) { auto [Rsrc, Off] = visit(I); - assert((Rsrc && Off) || - (!Rsrc && !Off) && "Can't have a resource but no offset"); + assert(((Rsrc && Off) || (!Rsrc && !Off)) && + "Can't have a resource but no offset"); if (Rsrc) RsrcParts[I] = Rsrc; if (Off) -- cgit v1.2.3 From 6fa2d03bbf86bc01140e3b007554617b029fd3f1 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Apr 2024 14:58:42 +0100 Subject: AMDGPULowerBufferFatPointers.cpp - fix Wunused-variable warning. NFC. 
--- llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 20ca633ddee8..1114a8c40114 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -1086,7 +1086,7 @@ void SplitPtrStructs::processConditionals() { if (MaybeRsrc) for (Value *V : Seen) FoundRsrcs[cast(V)] = NewRsrc; - } else if (auto *SI = dyn_cast(I)) { + } else if (isa(I)) { if (MaybeRsrc) { ConditionalTemps.push_back(cast(Rsrc)); Rsrc->replaceAllUsesWith(*MaybeRsrc); -- cgit v1.2.3 From a69673615bb9f14794056470a32f70f60a52213d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Apr 2024 15:19:07 +0100 Subject: [X86] Haswell/Broadwell - fix (V)ROUND*ri sched behaviours to use 2*Port1 We were only using the Port23 memory ports and were missing the 2*Port1 uops entirely. Confirmed by Agner + uops.info/uica --- llvm/lib/Target/X86/X86SchedBroadwell.td | 6 ++--- llvm/lib/Target/X86/X86SchedHaswell.td | 9 +++----- .../tools/llvm-mca/X86/Broadwell/resources-avx1.s | 26 +++++++++++----------- .../tools/llvm-mca/X86/Broadwell/resources-sse41.s | 18 +++++++-------- .../tools/llvm-mca/X86/Haswell/resources-avx1.s | 26 +++++++++++----------- .../tools/llvm-mca/X86/Haswell/resources-sse41.s | 18 +++++++-------- 6 files changed, 49 insertions(+), 54 deletions(-) diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td index b3ee7a82b917..63ac91028ac9 100644 --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -329,11 +329,9 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : BWWriteResPair; // Floating point fabs/fchs. -defm : X86WriteRes; // Floating point rounding. -defm : X86WriteRes; // Floating point rounding (YMM/ZMM). 
+defm : BWWriteResPair; // Floating point rounding. +defm : BWWriteResPair; // Floating point rounding (YMM/ZMM). defm : X86WriteResPairUnsupported; -defm : X86WriteRes; -defm : X86WriteRes; defm : BWWriteResPair; // Floating point and/or/xor logicals. defm : BWWriteResPair; // Floating point and/or/xor logicals (YMM/ZMM). defm : X86WriteResPairUnsupported; diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index 6c301a3cd342..516dc62f1b6d 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -329,12 +329,9 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : HWWriteResPair; -defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; // Unsupported = 1 -defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; // Unsupported = 1 +defm : HWWriteResPair; +defm : HWWriteResPair; +defm : HWWriteResPair; // Unsupported = 1 defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; // Unsupported = 1 diff --git a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s index ca1faf62aa89..1b196b4355a6 100644 --- a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s +++ b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s @@ -1632,17 +1632,17 @@ vzeroupper # CHECK-NEXT: 4 17 2.00 * vrcpps (%rax), %ymm2 # CHECK-NEXT: 1 5 1.00 vrcpss %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 2 10 1.00 * vrcpss (%rax), %xmm1, %xmm2 -# CHECK-NEXT: 1 6 0.50 vroundpd $1, %xmm0, %xmm2 +# CHECK-NEXT: 2 6 2.00 vroundpd $1, %xmm0, %xmm2 # CHECK-NEXT: 3 11 2.00 * vroundpd $1, (%rax), %xmm2 -# CHECK-NEXT: 1 6 0.50 vroundpd $1, %ymm0, %ymm2 +# CHECK-NEXT: 2 6 2.00 vroundpd $1, %ymm0, %ymm2 # CHECK-NEXT: 3 12 2.00 * vroundpd $1, (%rax), %ymm2 -# CHECK-NEXT: 1 6 0.50 vroundps $1, %xmm0, %xmm2 +# CHECK-NEXT: 2 6 2.00 vroundps $1, %xmm0, %xmm2 # CHECK-NEXT: 3 11 2.00 * vroundps $1, (%rax), %xmm2 -# CHECK-NEXT: 1 6 0.50 vroundps 
$1, %ymm0, %ymm2 +# CHECK-NEXT: 2 6 2.00 vroundps $1, %ymm0, %ymm2 # CHECK-NEXT: 3 12 2.00 * vroundps $1, (%rax), %ymm2 -# CHECK-NEXT: 1 6 0.50 vroundsd $1, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 2 6 2.00 vroundsd $1, %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 3 11 2.00 * vroundsd $1, (%rax), %xmm1, %xmm2 -# CHECK-NEXT: 1 6 0.50 vroundss $1, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 2 6 2.00 vroundss $1, %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 3 11 2.00 * vroundss $1, (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 5 1.00 vrsqrtps %xmm0, %xmm2 # CHECK-NEXT: 2 10 1.00 * vrsqrtps (%rax), %xmm2 @@ -1736,7 +1736,7 @@ vzeroupper # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - 257.00 216.25 235.25 176.17 176.17 38.00 424.25 3.25 12.67 +# CHECK-NEXT: - 257.00 216.25 247.25 173.17 173.17 38.00 424.25 3.25 12.67 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -2342,17 +2342,17 @@ vzeroupper # CHECK-NEXT: - - 2.33 0.33 0.50 0.50 - 0.33 - - vrcpps (%rax), %ymm2 # CHECK-NEXT: - - 1.00 - - - - - - - vrcpss %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vrcpss (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundpd $1, %xmm0, %xmm2 +# CHECK-NEXT: - - - 2.00 - - - - - - vroundpd $1, %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundpd $1, (%rax), %xmm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundpd $1, %ymm0, %ymm2 +# CHECK-NEXT: - - - 2.00 - - - - - - vroundpd $1, %ymm0, %ymm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundpd $1, (%rax), %ymm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundps $1, %xmm0, %xmm2 +# CHECK-NEXT: - - - 2.00 - - - - - - vroundps $1, %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundps $1, (%rax), %xmm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundps $1, %ymm0, %ymm2 +# CHECK-NEXT: - - - 2.00 - - - - - - vroundps $1, %ymm0, %ymm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundps $1, (%rax), %ymm2 
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundsd $1, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - 2.00 - - - - - - vroundsd $1, %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundsd $1, (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundss $1, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - 2.00 - - - - - - vroundss $1, %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundss $1, (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - 1.00 - - - - - - - vrsqrtps %xmm0, %xmm2 # CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vrsqrtps (%rax), %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s index dcc535306c85..4865121308bb 100644 --- a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s +++ b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s @@ -243,13 +243,13 @@ roundss $1, (%rax), %xmm2 # CHECK-NEXT: 3 15 2.00 * pmulld (%rax), %xmm2 # CHECK-NEXT: 2 2 1.00 ptest %xmm0, %xmm1 # CHECK-NEXT: 3 7 1.00 * ptest (%rax), %xmm1 -# CHECK-NEXT: 1 6 0.50 roundpd $1, %xmm0, %xmm2 +# CHECK-NEXT: 2 6 2.00 roundpd $1, %xmm0, %xmm2 # CHECK-NEXT: 3 11 2.00 * roundpd $1, (%rax), %xmm2 -# CHECK-NEXT: 1 6 0.50 roundps $1, %xmm0, %xmm2 +# CHECK-NEXT: 2 6 2.00 roundps $1, %xmm0, %xmm2 # CHECK-NEXT: 3 11 2.00 * roundps $1, (%rax), %xmm2 -# CHECK-NEXT: 1 6 0.50 roundsd $1, %xmm0, %xmm2 +# CHECK-NEXT: 2 6 2.00 roundsd $1, %xmm0, %xmm2 # CHECK-NEXT: 3 11 2.00 * roundsd $1, (%rax), %xmm2 -# CHECK-NEXT: 1 6 0.50 roundss $1, %xmm0, %xmm2 +# CHECK-NEXT: 2 6 2.00 roundss $1, %xmm0, %xmm2 # CHECK-NEXT: 3 11 2.00 * roundss $1, (%rax), %xmm2 # CHECK: Resources: @@ -266,7 +266,7 @@ roundss $1, (%rax), %xmm2 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - - 23.83 22.33 25.67 25.67 5.00 80.33 0.50 1.67 +# CHECK-NEXT: - - 23.83 30.33 23.67 23.67 5.00 80.33 0.50 1.67 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] 
[1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -358,11 +358,11 @@ roundss $1, (%rax), %xmm2 # CHECK-NEXT: - - 2.00 - 0.50 0.50 - - - - pmulld (%rax), %xmm2 # CHECK-NEXT: - - 1.00 - - - - 1.00 - - ptest %xmm0, %xmm1 # CHECK-NEXT: - - 1.00 - 0.50 0.50 - 1.00 - - ptest (%rax), %xmm1 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundpd $1, %xmm0, %xmm2 +# CHECK-NEXT: - - - 2.00 - - - - - - roundpd $1, %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundpd $1, (%rax), %xmm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundps $1, %xmm0, %xmm2 +# CHECK-NEXT: - - - 2.00 - - - - - - roundps $1, %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundps $1, (%rax), %xmm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundsd $1, %xmm0, %xmm2 +# CHECK-NEXT: - - - 2.00 - - - - - - roundsd $1, %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundsd $1, (%rax), %xmm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundss $1, %xmm0, %xmm2 +# CHECK-NEXT: - - - 2.00 - - - - - - roundss $1, %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundss $1, (%rax), %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s index cff60c9ce3ab..05c476079c0f 100644 --- a/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s +++ b/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s @@ -1632,17 +1632,17 @@ vzeroupper # CHECK-NEXT: 4 18 2.00 * vrcpps (%rax), %ymm2 # CHECK-NEXT: 1 5 1.00 vrcpss %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 2 10 1.00 * vrcpss (%rax), %xmm1, %xmm2 -# CHECK-NEXT: 1 6 0.50 vroundpd $1, %xmm0, %xmm2 +# CHECK-NEXT: 2 6 2.00 vroundpd $1, %xmm0, %xmm2 # CHECK-NEXT: 3 12 2.00 * vroundpd $1, (%rax), %xmm2 -# CHECK-NEXT: 1 6 0.50 vroundpd $1, %ymm0, %ymm2 +# CHECK-NEXT: 2 6 2.00 vroundpd $1, %ymm0, %ymm2 # CHECK-NEXT: 3 13 2.00 * vroundpd $1, (%rax), %ymm2 -# CHECK-NEXT: 1 6 0.50 vroundps $1, %xmm0, %xmm2 +# CHECK-NEXT: 2 6 2.00 vroundps $1, %xmm0, %xmm2 # CHECK-NEXT: 3 12 2.00 * vroundps $1, 
(%rax), %xmm2 -# CHECK-NEXT: 1 6 0.50 vroundps $1, %ymm0, %ymm2 +# CHECK-NEXT: 2 6 2.00 vroundps $1, %ymm0, %ymm2 # CHECK-NEXT: 3 13 2.00 * vroundps $1, (%rax), %ymm2 -# CHECK-NEXT: 1 6 0.50 vroundsd $1, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 2 6 2.00 vroundsd $1, %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 3 12 2.00 * vroundsd $1, (%rax), %xmm1, %xmm2 -# CHECK-NEXT: 1 6 0.50 vroundss $1, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 2 6 2.00 vroundss $1, %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 3 12 2.00 * vroundss $1, (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 5 1.00 vrsqrtps %xmm0, %xmm2 # CHECK-NEXT: 2 11 1.00 * vrsqrtps (%rax), %xmm2 @@ -1736,7 +1736,7 @@ vzeroupper # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - 336.00 215.58 236.58 176.17 176.17 38.00 427.58 3.25 12.67 +# CHECK-NEXT: - 336.00 215.58 248.58 173.17 173.17 38.00 427.58 3.25 12.67 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -2342,17 +2342,17 @@ vzeroupper # CHECK-NEXT: - - 2.33 0.33 0.50 0.50 - 0.33 - - vrcpps (%rax), %ymm2 # CHECK-NEXT: - - 1.00 - - - - - - - vrcpss %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vrcpss (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundpd $1, %xmm0, %xmm2 +# CHECK-NEXT: - - - 2.00 - - - - - - vroundpd $1, %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundpd $1, (%rax), %xmm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundpd $1, %ymm0, %ymm2 +# CHECK-NEXT: - - - 2.00 - - - - - - vroundpd $1, %ymm0, %ymm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundpd $1, (%rax), %ymm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundps $1, %xmm0, %xmm2 +# CHECK-NEXT: - - - 2.00 - - - - - - vroundps $1, %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundps $1, (%rax), %xmm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundps $1, %ymm0, %ymm2 +# CHECK-NEXT: - - - 2.00 - - - - - - vroundps $1, %ymm0, %ymm2 # CHECK-NEXT: - - - 2.00 
0.50 0.50 - - - - vroundps $1, (%rax), %ymm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundsd $1, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - 2.00 - - - - - - vroundsd $1, %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundsd $1, (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundss $1, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - 2.00 - - - - - - vroundss $1, %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundss $1, (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - 1.00 - - - - - - - vrsqrtps %xmm0, %xmm2 # CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vrsqrtps (%rax), %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s index c2d07735f1cb..62dfa23a6bad 100644 --- a/llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s +++ b/llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s @@ -243,13 +243,13 @@ roundss $1, (%rax), %xmm2 # CHECK-NEXT: 3 16 2.00 * pmulld (%rax), %xmm2 # CHECK-NEXT: 2 2 1.00 ptest %xmm0, %xmm1 # CHECK-NEXT: 3 8 1.00 * ptest (%rax), %xmm1 -# CHECK-NEXT: 1 6 0.50 roundpd $1, %xmm0, %xmm2 +# CHECK-NEXT: 2 6 2.00 roundpd $1, %xmm0, %xmm2 # CHECK-NEXT: 3 12 2.00 * roundpd $1, (%rax), %xmm2 -# CHECK-NEXT: 1 6 0.50 roundps $1, %xmm0, %xmm2 +# CHECK-NEXT: 2 6 2.00 roundps $1, %xmm0, %xmm2 # CHECK-NEXT: 3 12 2.00 * roundps $1, (%rax), %xmm2 -# CHECK-NEXT: 1 6 0.50 roundsd $1, %xmm0, %xmm2 +# CHECK-NEXT: 2 6 2.00 roundsd $1, %xmm0, %xmm2 # CHECK-NEXT: 3 12 2.00 * roundsd $1, (%rax), %xmm2 -# CHECK-NEXT: 1 6 0.50 roundss $1, %xmm0, %xmm2 +# CHECK-NEXT: 2 6 2.00 roundss $1, %xmm0, %xmm2 # CHECK-NEXT: 3 12 2.00 * roundss $1, (%rax), %xmm2 # CHECK: Resources: @@ -266,7 +266,7 @@ roundss $1, (%rax), %xmm2 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - - 23.83 22.33 25.67 25.67 5.00 80.33 0.50 1.67 +# CHECK-NEXT: - - 23.83 30.33 23.67 23.67 5.00 80.33 0.50 1.67 # CHECK: Resource pressure 
by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -358,11 +358,11 @@ roundss $1, (%rax), %xmm2 # CHECK-NEXT: - - 2.00 - 0.50 0.50 - - - - pmulld (%rax), %xmm2 # CHECK-NEXT: - - 1.00 - - - - 1.00 - - ptest %xmm0, %xmm1 # CHECK-NEXT: - - 1.00 - 0.50 0.50 - 1.00 - - ptest (%rax), %xmm1 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundpd $1, %xmm0, %xmm2 +# CHECK-NEXT: - - - 2.00 - - - - - - roundpd $1, %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundpd $1, (%rax), %xmm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundps $1, %xmm0, %xmm2 +# CHECK-NEXT: - - - 2.00 - - - - - - roundps $1, %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundps $1, (%rax), %xmm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundsd $1, %xmm0, %xmm2 +# CHECK-NEXT: - - - 2.00 - - - - - - roundsd $1, %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundsd $1, (%rax), %xmm2 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundss $1, %xmm0, %xmm2 +# CHECK-NEXT: - - - 2.00 - - - - - - roundss $1, %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundss $1, (%rax), %xmm2 -- cgit v1.2.3 From ecb34599bdadfb74ee22529ad150e7500dd22641 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Apr 2024 15:20:16 +0100 Subject: [X86] Add missing immediate qualifier to the (V)ROUND instructions (#87636) Makes it easier to algorithmically recreate the instruction name in various analysis scripts I'm working on --- llvm/lib/Target/X86/X86InstrInfo.cpp | 32 +++--- llvm/lib/Target/X86/X86InstrSSE.td | 116 ++++++++++----------- llvm/lib/Target/X86/X86SchedSapphireRapids.td | 14 +-- llvm/test/TableGen/x86-fold-tables.inc | 28 ++--- .../utils/TableGen/X86ManualCompressEVEXTables.def | 16 +-- 5 files changed, 103 insertions(+), 103 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index f24334312c11..a5b2e4895ede 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp 
@@ -6276,10 +6276,10 @@ static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget, case X86::RCPSSm: case X86::RCPSSr_Int: case X86::RCPSSm_Int: - case X86::ROUNDSDr: - case X86::ROUNDSDm: - case X86::ROUNDSSr: - case X86::ROUNDSSm: + case X86::ROUNDSDri: + case X86::ROUNDSDmi: + case X86::ROUNDSSri: + case X86::ROUNDSSmi: case X86::RSQRTSSr: case X86::RSQRTSSm: case X86::RSQRTSSr_Int: @@ -6778,14 +6778,14 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum, case X86::VRCPSSr_Int: case X86::VRCPSSm: case X86::VRCPSSm_Int: - case X86::VROUNDSDr: - case X86::VROUNDSDm: - case X86::VROUNDSDr_Int: - case X86::VROUNDSDm_Int: - case X86::VROUNDSSr: - case X86::VROUNDSSm: - case X86::VROUNDSSr_Int: - case X86::VROUNDSSm_Int: + case X86::VROUNDSDri: + case X86::VROUNDSDmi: + case X86::VROUNDSDri_Int: + case X86::VROUNDSDmi_Int: + case X86::VROUNDSSri: + case X86::VROUNDSSmi: + case X86::VROUNDSSri_Int: + case X86::VROUNDSSmi_Int: case X86::VRSQRTSSr: case X86::VRSQRTSSr_Int: case X86::VRSQRTSSm: @@ -7516,8 +7516,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::VRCPSSr_Int: case X86::RSQRTSSr_Int: case X86::VRSQRTSSr_Int: - case X86::ROUNDSSr_Int: - case X86::VROUNDSSr_Int: + case X86::ROUNDSSri_Int: + case X86::VROUNDSSri_Int: case X86::COMISSrr_Int: case X86::VCOMISSrr_Int: case X86::VCOMISSZrr_Int: @@ -7685,8 +7685,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::VCVTSD2USI64Zrr_Int: case X86::VCVTTSD2USIZrr_Int: case X86::VCVTTSD2USI64Zrr_Int: - case X86::ROUNDSDr_Int: - case X86::VROUNDSDr_Int: + case X86::ROUNDSDri_Int: + case X86::VROUNDSDri_Int: case X86::COMISDrr_Int: case X86::VCOMISDrr_Int: case X86::VCOMISDZrr_Int: diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 69d45366a1db..2b391b60f2c9 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -5475,35 +5475,35 @@ multiclass 
sse41_fp_unop_p opc, string OpcodeStr, // Intrinsic operation, reg. // Vector intrinsic operation, reg let Uses = [MXCSR], mayRaiseFPException = 1 in { - def r : SS4AIi8, - Sched<[sched]>; + def ri : SS4AIi8, + Sched<[sched]>; // Vector intrinsic operation, mem - def m : SS4AIi8, - Sched<[sched.Folded]>; + def mi : SS4AIi8, + Sched<[sched.Folded]>; } } multiclass avx_fp_unop_rm opcss, bits<8> opcsd, string OpcodeStr, X86FoldableSchedWrite sched> { let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in { - def SSr : SS4AIi8, Sched<[sched]>; let mayLoad = 1 in - def SSm : SS4AIi8, Sched<[sched]>; let mayLoad = 1 in - def SDm : SS4AIi8 opcss, bits<8> opcsd, string OpcodeStr, X86FoldableSchedWrite sched> { let Uses = [MXCSR], mayRaiseFPException = 1 in { let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in { - def SSr : SS4AIi8, Sched<[sched]>; + def SSri : SS4AIi8, Sched<[sched]>; let mayLoad = 1 in - def SSm : SS4AIi8, Sched<[sched.Folded, sched.ReadAfterFold]>; + def SSmi : SS4AIi8, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedSingle, hasSideEffects = 0 let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in { - def SDr : SS4AIi8, Sched<[sched]>; + def SDri : SS4AIi8, Sched<[sched]>; let mayLoad = 1 in - def SDm : SS4AIi8, Sched<[sched.Folded, sched.ReadAfterFold]>; + def SDmi : SS4AIi8, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedDouble, hasSideEffects = 0 } } -multiclass sse41_fp_binop_s opcss, bits<8> opcsd, - string OpcodeStr, X86FoldableSchedWrite sched, - ValueType VT32, ValueType VT64, - SDNode OpNode, bit Is2Addr = 1> { +multiclass sse41_fp_unop_s_int opcss, bits<8> opcsd, + string OpcodeStr, X86FoldableSchedWrite sched, + ValueType VT32, ValueType VT64, + SDNode OpNode, bit Is2Addr = 1> { let Uses = [MXCSR], mayRaiseFPException = 1 in { let ExeDomain = SSEPackedSingle in { - def SSr_Int : SS4AIi8, Sched<[sched]>; - def SSm_Int : SS4AIi8, 
Sched<[sched]>; - def SDm_Int : SS4AIi8, - VEX, VVVV, VEX_LIG, WIG, SIMD_EXC; + defm VROUND : sse41_fp_unop_s_int<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl, + v4f32, v2f64, X86RndScales, 0>, + VEX, VVVV, VEX_LIG, WIG, SIMD_EXC; defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>, VEX, VVVV, VEX_LIG, WIG, SIMD_EXC; } let Predicates = [UseAVX] in { def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>; + (VROUNDSSri (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>; def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>; + (VROUNDSDri (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>; } let Predicates = [UseAVX, OptForSize] in { def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2), - (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; + (VROUNDSSmi (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2), - (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; + (VROUNDSDmi (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; } let ExeDomain = SSEPackedSingle in @@ -5667,21 +5667,21 @@ defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64, defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>; let Constraints = "$src1 = $dst" in -defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl, - v4f32, v2f64, X86RndScales>; +defm ROUND : sse41_fp_unop_s_int<0x0A, 0x0B, "round", SchedWriteFRnd.Scl, + v4f32, v2f64, X86RndScales>; let Predicates = [UseSSE41] in { def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2), - (ROUNDSSr FR32:$src1, timm:$src2)>; + (ROUNDSSri FR32:$src1, timm:$src2)>; def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2), - (ROUNDSDr FR64:$src1, timm:$src2)>; + (ROUNDSDri FR64:$src1, timm:$src2)>; } let Predicates = [UseSSE41, OptForSize] in { def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2), - (ROUNDSSm 
addr:$src1, timm:$src2)>; + (ROUNDSSmi addr:$src1, timm:$src2)>; def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2), - (ROUNDSDm addr:$src1, timm:$src2)>; + (ROUNDSDmi addr:$src1, timm:$src2)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86SchedSapphireRapids.td b/llvm/lib/Target/X86/X86SchedSapphireRapids.td index 88bb9ad8f1d7..ff3fe32be185 100644 --- a/llvm/lib/Target/X86/X86SchedSapphireRapids.td +++ b/llvm/lib/Target/X86/X86SchedSapphireRapids.td @@ -2290,8 +2290,8 @@ def SPRWriteResGroup218 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_11]> { let Latency = 15; let NumMicroOps = 3; } -def : InstRW<[SPRWriteResGroup218], (instregex "^(V?)ROUNDP(D|S)m$")>; -def : InstRW<[SPRWriteResGroup218, ReadAfterVecXLd], (instregex "^(V?)ROUNDS(D|S)m((_Int)?)$", +def : InstRW<[SPRWriteResGroup218], (instregex "^(V?)ROUNDP(D|S)mi$")>; +def : InstRW<[SPRWriteResGroup218, ReadAfterVecXLd], (instregex "^(V?)ROUNDS(D|S)mi((_Int)?)$", "^VRNDSCALEP(D|S)Z128rm(bi|ik)$", "^VRNDSCALEP(D|S)Z128rmbik(z?)$", "^VRNDSCALEP(D|S)Z128rmi((kz)?)$", @@ -2303,13 +2303,13 @@ def SPRWriteResGroup219 : SchedWriteRes<[SPRPort00_01]> { let Latency = 8; let NumMicroOps = 2; } -def : InstRW<[SPRWriteResGroup219], (instregex "^(V?)ROUND(PD|SS)r$", - "^(V?)ROUND(PS|SD)r$", - "^(V?)ROUNDS(D|S)r_Int$", +def : InstRW<[SPRWriteResGroup219], (instregex "^(V?)ROUND(PD|SS)ri$", + "^(V?)ROUND(PS|SD)ri$", + "^(V?)ROUNDS(D|S)ri_Int$", "^VRNDSCALEP(D|S)Z(128|256)rri((k|kz)?)$", "^VRNDSCALES(D|S)Zr$", "^VRNDSCALES(D|S)Zr(b?)_Int((k|kz)?)$", - "^VROUNDP(D|S)Yr$")>; + "^VROUNDP(D|S)Yri$")>; def SPRWriteResGroup220 : SchedWriteRes<[SPRPort00_06]> { let ReleaseAtCycles = [2]; @@ -3737,7 +3737,7 @@ def SPRWriteResGroup390 : SchedWriteRes<[SPRPort00_01, SPRPort02_03_11]> { let NumMicroOps = 3; } def : InstRW<[SPRWriteResGroup390], (instregex "^VF(C?)MADDCPHZ(128|256)m(b?)$", - "^VROUNDP(D|S)Ym$")>; + "^VROUNDP(D|S)Ymi$")>; def : 
InstRW<[SPRWriteResGroup390, ReadAfterVecXLd], (instregex "^VF(C?)MADDCSHZm$", "^VF(C?)MULCPHZ128rm(b?)$", "^VF(C?)MULCSHZrm$", diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc index 4ab5567f6287..493350d7bd63 100644 --- a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -984,10 +984,10 @@ static const X86FoldTableEntry Table1[] = { {X86::RORX32ri_EVEX, X86::RORX32mi_EVEX, 0}, {X86::RORX64ri, X86::RORX64mi, 0}, {X86::RORX64ri_EVEX, X86::RORX64mi_EVEX, 0}, - {X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16}, - {X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16}, - {X86::ROUNDSDr, X86::ROUNDSDm, 0}, - {X86::ROUNDSSr, X86::ROUNDSSm, 0}, + {X86::ROUNDPDri, X86::ROUNDPDmi, TB_ALIGN_16}, + {X86::ROUNDPSri, X86::ROUNDPSmi, TB_ALIGN_16}, + {X86::ROUNDSDri, X86::ROUNDSDmi, 0}, + {X86::ROUNDSSri, X86::ROUNDSSmi, 0}, {X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16}, {X86::RSQRTSSr, X86::RSQRTSSm, 0}, {X86::SAR16r1_ND, X86::SAR16m1_ND, 0}, @@ -1791,10 +1791,10 @@ static const X86FoldTableEntry Table1[] = { {X86::VRNDSCALEPSZ128rri, X86::VRNDSCALEPSZ128rmi, 0}, {X86::VRNDSCALEPSZ256rri, X86::VRNDSCALEPSZ256rmi, 0}, {X86::VRNDSCALEPSZrri, X86::VRNDSCALEPSZrmi, 0}, - {X86::VROUNDPDYr, X86::VROUNDPDYm, 0}, - {X86::VROUNDPDr, X86::VROUNDPDm, 0}, - {X86::VROUNDPSYr, X86::VROUNDPSYm, 0}, - {X86::VROUNDPSr, X86::VROUNDPSm, 0}, + {X86::VROUNDPDYri, X86::VROUNDPDYmi, 0}, + {X86::VROUNDPDri, X86::VROUNDPDmi, 0}, + {X86::VROUNDPSYri, X86::VROUNDPSYmi, 0}, + {X86::VROUNDPSri, X86::VROUNDPSmi, 0}, {X86::VRSQRT14PDZ128r, X86::VRSQRT14PDZ128m, 0}, {X86::VRSQRT14PDZ256r, X86::VRSQRT14PDZ256m, 0}, {X86::VRSQRT14PDZr, X86::VRSQRT14PDZm, 0}, @@ -2234,8 +2234,8 @@ static const X86FoldTableEntry Table2[] = { {X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16}, {X86::PXORrr, X86::PXORrm, TB_ALIGN_16}, {X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE}, - {X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE}, - {X86::ROUNDSSr_Int, 
X86::ROUNDSSm_Int, TB_NO_REVERSE}, + {X86::ROUNDSDri_Int, X86::ROUNDSDmi_Int, TB_NO_REVERSE}, + {X86::ROUNDSSri_Int, X86::ROUNDSSmi_Int, TB_NO_REVERSE}, {X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE}, {X86::SBB16rr, X86::SBB16rm, 0}, {X86::SBB16rr_ND, X86::SBB16rm_ND, 0}, @@ -3778,10 +3778,10 @@ static const X86FoldTableEntry Table2[] = { {X86::VRNDSCALESHZr_Int, X86::VRNDSCALESHZm_Int, TB_NO_REVERSE}, {X86::VRNDSCALESSZr, X86::VRNDSCALESSZm, 0}, {X86::VRNDSCALESSZr_Int, X86::VRNDSCALESSZm_Int, TB_NO_REVERSE}, - {X86::VROUNDSDr, X86::VROUNDSDm, 0}, - {X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE}, - {X86::VROUNDSSr, X86::VROUNDSSm, 0}, - {X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE}, + {X86::VROUNDSDri, X86::VROUNDSDmi, 0}, + {X86::VROUNDSDri_Int, X86::VROUNDSDmi_Int, TB_NO_REVERSE}, + {X86::VROUNDSSri, X86::VROUNDSSmi, 0}, + {X86::VROUNDSSri_Int, X86::VROUNDSSmi_Int, TB_NO_REVERSE}, {X86::VRSQRT14PDZ128rkz, X86::VRSQRT14PDZ128mkz, 0}, {X86::VRSQRT14PDZ256rkz, X86::VRSQRT14PDZ256mkz, 0}, {X86::VRSQRT14PDZrkz, X86::VRSQRT14PDZmkz, 0}, diff --git a/llvm/utils/TableGen/X86ManualCompressEVEXTables.def b/llvm/utils/TableGen/X86ManualCompressEVEXTables.def index 77cf65be6842..665a394f57a6 100644 --- a/llvm/utils/TableGen/X86ManualCompressEVEXTables.def +++ b/llvm/utils/TableGen/X86ManualCompressEVEXTables.def @@ -197,12 +197,12 @@ ENTRY(VPUNPCKLQDQZ128rm, VPUNPCKLQDQrm) ENTRY(VPUNPCKLQDQZ128rr, VPUNPCKLQDQrr) ENTRY(VPXORQZ128rm, VPXORrm) ENTRY(VPXORQZ128rr, VPXORrr) -ENTRY(VRNDSCALEPDZ128rmi, VROUNDPDm) -ENTRY(VRNDSCALEPDZ128rri, VROUNDPDr) -ENTRY(VRNDSCALESDZm, VROUNDSDm) -ENTRY(VRNDSCALESDZm_Int, VROUNDSDm_Int) -ENTRY(VRNDSCALESDZr, VROUNDSDr) -ENTRY(VRNDSCALESDZr_Int, VROUNDSDr_Int) +ENTRY(VRNDSCALEPDZ128rmi, VROUNDPDmi) +ENTRY(VRNDSCALEPDZ128rri, VROUNDPDri) +ENTRY(VRNDSCALESDZm, VROUNDSDmi) +ENTRY(VRNDSCALESDZm_Int, VROUNDSDmi_Int) +ENTRY(VRNDSCALESDZr, VROUNDSDri) +ENTRY(VRNDSCALESDZr_Int, VROUNDSDri_Int) ENTRY(VSHUFPDZ128rmi, VSHUFPDrmi) 
ENTRY(VSHUFPDZ128rri, VSHUFPDrri) ENTRY(VSQRTPDZ128m, VSQRTPDm) @@ -306,8 +306,8 @@ ENTRY(VPUNPCKLQDQZ256rm, VPUNPCKLQDQYrm) ENTRY(VPUNPCKLQDQZ256rr, VPUNPCKLQDQYrr) ENTRY(VPXORQZ256rm, VPXORYrm) ENTRY(VPXORQZ256rr, VPXORYrr) -ENTRY(VRNDSCALEPDZ256rmi, VROUNDPDYm) -ENTRY(VRNDSCALEPDZ256rri, VROUNDPDYr) +ENTRY(VRNDSCALEPDZ256rmi, VROUNDPDYmi) +ENTRY(VRNDSCALEPDZ256rri, VROUNDPDYri) ENTRY(VSHUFPDZ256rmi, VSHUFPDYrmi) ENTRY(VSHUFPDZ256rri, VSHUFPDYrri) ENTRY(VSQRTPDZ256m, VSQRTPDYm) -- cgit v1.2.3 From c1742525d0126a6124d15512b7283c4e37c7c186 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Apr 2024 15:42:29 +0100 Subject: [X86] evex-to-vex-compress.mir - update test checks missed in #87636 --- llvm/test/CodeGen/X86/evex-to-vex-compress.mir | 32 +++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir index 548cf24b9200..13c958587327 100644 --- a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir +++ b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir @@ -869,13 +869,13 @@ body: | $ymm0 = VSHUFPSZ256rmi $ymm0, $rdi, 1, $noreg, 0, $noreg, -24 ; CHECK: $ymm0 = VSHUFPSYrri $ymm0, $ymm1, -24 $ymm0 = VSHUFPSZ256rri $ymm0, $ymm1, -24 - ; CHECK: $ymm0 = VROUNDPDYm $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $ymm0 = VROUNDPDYmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr $ymm0 = VRNDSCALEPDZ256rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr - ; CHECK: $ymm0 = VROUNDPDYr $ymm0, 15, implicit $mxcsr + ; CHECK: $ymm0 = VROUNDPDYri $ymm0, 15, implicit $mxcsr $ymm0 = VRNDSCALEPDZ256rri $ymm0, 15, implicit $mxcsr - ; CHECK: $ymm0 = VROUNDPSYm $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $ymm0 = VROUNDPSYmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr $ymm0 = VRNDSCALEPSZ256rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr - ; CHECK: $ymm0 = VROUNDPSYr $ymm0, 15, implicit $mxcsr + ; CHECK: $ymm0 = VROUNDPSYri 
$ymm0, 15, implicit $mxcsr $ymm0 = VRNDSCALEPSZ256rri $ymm0, 15, implicit $mxcsr ; CHECK: $ymm0 = VPERM2F128rm $ymm0, $rip, 1, $noreg, 0, $noreg, 32 $ymm0 = VSHUFF32X4Z256rmi $ymm0, $rip, 1, $noreg, 0, $noreg, 228 @@ -1751,13 +1751,13 @@ body: | $xmm0 = VALIGNQZ128rmi $xmm0, $rip, 1, $noreg, 0, $noreg, 1 ; CHECK: $xmm0 = VPALIGNRrri $xmm0, $xmm1, 8 $xmm0 = VALIGNQZ128rri $xmm0, $xmm1, 1 - ; CHECK: $xmm0 = VROUNDPDm $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDPDmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr $xmm0 = VRNDSCALEPDZ128rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr - ; CHECK: $xmm0 = VROUNDPDr $xmm0, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDPDri $xmm0, 15, implicit $mxcsr $xmm0 = VRNDSCALEPDZ128rri $xmm0, 15, implicit $mxcsr - ; CHECK: $xmm0 = VROUNDPSm $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDPSmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr $xmm0 = VRNDSCALEPSZ128rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr - ; CHECK: $xmm0 = VROUNDPSr $xmm0, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDPSri $xmm0, 15, implicit $mxcsr $xmm0 = VRNDSCALEPSZ128rri $xmm0, 15, implicit $mxcsr RET64 @@ -2308,21 +2308,21 @@ body: | $xmm0 = VINSERTPSZrm $xmm0, $rdi, 1, $noreg, 0, $noreg, 1 ; CHECK: $xmm0 = VINSERTPSrr $xmm0, $xmm0, 1 $xmm0 = VINSERTPSZrr $xmm0, $xmm0, 1 - ; CHECK: $xmm0 = VROUNDSDm $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDSDmi $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr $xmm0 = VRNDSCALESDZm $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr - ; CHECK: $xmm0 = VROUNDSDr $xmm0, $xmm1, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDSDri $xmm0, $xmm1, 15, implicit $mxcsr $xmm0 = VRNDSCALESDZr $xmm0, $xmm1, 15, implicit $mxcsr - ; CHECK: $xmm0 = VROUNDSSm $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDSSmi $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr $xmm0 = VRNDSCALESSZm $xmm0, $rip, 1, 
$noreg, 0, $noreg, 15, implicit $mxcsr - ; CHECK: $xmm0 = VROUNDSSr $xmm0, $xmm1, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDSSri $xmm0, $xmm1, 15, implicit $mxcsr $xmm0 = VRNDSCALESSZr $xmm0, $xmm1, 15, implicit $mxcsr - ; CHECK: $xmm0 = VROUNDSDm_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDSDmi_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr $xmm0 = VRNDSCALESDZm_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr - ; CHECK: $xmm0 = VROUNDSDr_Int $xmm0, $xmm1, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDSDri_Int $xmm0, $xmm1, 15, implicit $mxcsr $xmm0 = VRNDSCALESDZr_Int $xmm0, $xmm1, 15, implicit $mxcsr - ; CHECK: $xmm0 = VROUNDSSm_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDSSmi_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr $xmm0 = VRNDSCALESSZm_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr - ; CHECK: $xmm0 = VROUNDSSr_Int $xmm0, $xmm1, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDSSri_Int $xmm0, $xmm1, 15, implicit $mxcsr $xmm0 = VRNDSCALESSZr_Int $xmm0, $xmm1, 15, implicit $mxcsr RET64 -- cgit v1.2.3 From 8ebf7b7d7aed45889415669e0c7353f9b528161a Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Thu, 4 Apr 2024 07:46:55 -0700 Subject: [InstallAPI][Test] Add test for invalid verification mode (#87602) --- clang/test/InstallAPI/driver-invalid-options.test | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/clang/test/InstallAPI/driver-invalid-options.test b/clang/test/InstallAPI/driver-invalid-options.test index 69f3b2d66ab8..0c630eacd183 100644 --- a/clang/test/InstallAPI/driver-invalid-options.test +++ b/clang/test/InstallAPI/driver-invalid-options.test @@ -7,3 +7,9 @@ // RUN: not clang-installapi -target x86_64-apple-ios-simulator %s -o tmp.tbd 2> %t // RUN: FileCheck --check-prefix INVALID_INSTALL_NAME -input-file %t %s // INVALID_INSTALL_NAME: error: no install name specified: add -install_name + +/// Check invalid verification 
mode. +// RUN: not clang-installapi -install_name Foo -target arm64-apple-ios13 \ +// RUN: --verify-mode=Invalid -o tmp.tbd 2> %t +// RUN: FileCheck --check-prefix INVALID_VERIFY_MODE -input-file %t %s +// INVALID_VERIFY_MODE: error: invalid value 'Invalid' in '--verify-mode=Invalid' -- cgit v1.2.3 From 5b59ae423a9e86beddafb868b9d549b2f18825ab Mon Sep 17 00:00:00 2001 From: Piotr Sobczak Date: Thu, 4 Apr 2024 16:47:25 +0200 Subject: [DAG] Preserve NUW when reassociating (#87621) Similarly to the generic case below, preserve the NUW flag when reassociating adds with constants. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 73 +- llvm/test/CodeGen/AMDGPU/function-returns.ll | 87 +- .../CodeGen/AMDGPU/gfx-callable-return-types.ll | 582 +- llvm/test/CodeGen/WebAssembly/multi-return.ll | 72 +- llvm/test/CodeGen/WebAssembly/simd-arith.ll | 13138 ++++++++----------- llvm/test/CodeGen/WebAssembly/simd.ll | 408 +- 7 files changed, 6077 insertions(+), 8294 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 0a473180538a..f20080c8f108 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1164,19 +1164,20 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N01 = N0.getOperand(1); if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N01))) { + SDNodeFlags NewFlags; + if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() && + Flags.hasNoUnsignedWrap()) + NewFlags.setNoUnsignedWrap(true); + if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) { // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2)) if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, {N01, N1})) - return DAG.getNode(Opc, DL, VT, N00, OpNode); + return DAG.getNode(Opc, DL, VT, N00, OpNode, NewFlags); return SDValue(); } if 
(TLI.isReassocProfitable(DAG, N0, N1)) { // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1) // iff (op x, c1) has one use - SDNodeFlags NewFlags; - if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() && - Flags.hasNoUnsignedWrap()) - NewFlags.setNoUnsignedWrap(true); SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1, NewFlags); return DAG.getNode(Opc, DL, VT, OpNode, N01, NewFlags); } diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 98658834e897..bf4302c156d8 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -5678,22 +5678,18 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_clause 0x4 -; GFX11-NEXT: scratch_store_b128 off, v[18:21], s0 offset:64 -; GFX11-NEXT: scratch_store_b128 off, v[10:13], s0 offset:32 -; GFX11-NEXT: scratch_store_b128 off, v[6:9], s0 offset:16 -; GFX11-NEXT: scratch_store_b128 off, v[2:5], s0 -; GFX11-NEXT: scratch_store_b16 off, v1, s0 offset:128 -; GFX11-NEXT: s_add_i32 s1, s0, 0x70 -; GFX11-NEXT: s_add_i32 s2, s0, 0x60 -; GFX11-NEXT: s_add_i32 s3, s0, 0x50 -; GFX11-NEXT: s_add_i32 s0, s0, 48 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: scratch_store_b128 v0, v[22:25], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[18:21], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[14:17], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[30:33], s1 -; GFX11-NEXT: scratch_store_b128 off, v[26:29], s2 -; GFX11-NEXT: scratch_store_b128 off, v[22:25], s3 -; GFX11-NEXT: scratch_store_b128 off, 
v[14:17], s0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[30:33], off offset:112 +; GFX11-NEXT: scratch_store_b128 v0, v[26:29], off offset:96 +; GFX11-NEXT: scratch_store_b16 v0, v1, off offset:128 ; GFX11-NEXT: s_setpc_b64 s[30:31] %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0 %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1 @@ -8827,19 +8823,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54 ; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58 ; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_add_i32 s1, s0, 0xf0 -; GFX11-NEXT: s_add_i32 s2, s0, 0xe0 -; GFX11-NEXT: s_add_i32 s3, s0, 0xd0 -; GFX11-NEXT: s_add_i32 s4, s0, 0xc0 -; GFX11-NEXT: s_add_i32 s5, s0, 0xb0 -; GFX11-NEXT: s_add_i32 s6, s0, 0xa0 -; GFX11-NEXT: s_add_i32 s7, s0, 0x90 -; GFX11-NEXT: s_add_i32 s8, s0, 0x70 -; GFX11-NEXT: s_add_i32 s9, s0, 0x60 -; GFX11-NEXT: s_add_i32 s10, s0, 0x50 -; GFX11-NEXT: s_add_i32 s11, s0, 48 ; GFX11-NEXT: s_waitcnt vmcnt(31) ; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; GFX11-NEXT: s_waitcnt vmcnt(30) @@ -8936,23 +8919,23 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37 -; GFX11-NEXT: scratch_store_b128 off, v[96:99], s1 -; GFX11-NEXT: scratch_store_b128 off, v[84:87], s2 -; GFX11-NEXT: scratch_store_b128 off, v[80:83], s3 -; GFX11-NEXT: scratch_store_b128 off, v[68:71], s4 -; GFX11-NEXT: scratch_store_b128 off, v[64:67], s5 -; GFX11-NEXT: scratch_store_b128 off, v[52:55], s6 -; GFX11-NEXT: scratch_store_b128 off, v[48:51], s7 -; GFX11-NEXT: scratch_store_b128 off, v[33:36], s0 offset:128 -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s8 -; 
GFX11-NEXT: scratch_store_b128 off, v[25:28], s9 -; GFX11-NEXT: scratch_store_b128 off, v[21:24], s10 -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s0 offset:64 -; GFX11-NEXT: scratch_store_b128 off, v[13:16], s11 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32 -; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_store_b128 v0, v[96:99], off offset:240 +; GFX11-NEXT: scratch_store_b128 v0, v[84:87], off offset:224 +; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off offset:208 +; GFX11-NEXT: scratch_store_b128 v0, v[68:71], off offset:192 +; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:176 +; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160 +; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144 +; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128 +; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 +; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <32 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <32 x bfloat> %load to <32 x double> diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll index acadee279817..401cbce00ac9 100644 --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -1561,34 +1561,28 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80 ; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 
0 offset:64 ; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48 -; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16 -; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 -; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 ; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_add_i32 s1, s0, 0x70 -; GFX11-NEXT: s_add_i32 s2, s0, 0x60 -; GFX11-NEXT: s_add_i32 s3, s0, 0x50 -; GFX11-NEXT: s_add_i32 s4, s0, 48 ; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:48 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0 +; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: 
scratch_store_b32 off, v33, s0 offset:128 +; GFX11-NEXT: scratch_store_b32 v0, v33, off offset:128 ; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <33 x i32>, ptr addrspace(1) %ptr @@ -1850,34 +1844,28 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80 ; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64 ; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48 -; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16 -; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 -; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 ; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_add_i32 s1, s0, 0x70 -; GFX11-NEXT: s_add_i32 s2, s0, 0x60 -; GFX11-NEXT: s_add_i32 s3, s0, 0x50 -; GFX11-NEXT: s_add_i32 s4, s0, 48 ; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:48 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: 
scratch_store_b128 off, v[21:24], s0 offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0 +; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128 +; GFX11-NEXT: scratch_store_b32 v0, v33, off offset:128 ; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr @@ -2143,33 +2131,24 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:144 ; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:128 ; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_add_i32 s1, s0, 0xf0 -; GFX11-NEXT: s_add_i32 s2, s0, 0xe0 -; GFX11-NEXT: s_add_i32 s3, s0, 0xd0 -; GFX11-NEXT: s_add_i32 s4, s0, 0xc0 -; GFX11-NEXT: s_add_i32 s5, s0, 0xb0 -; GFX11-NEXT: s_add_i32 s6, s0, 0xa0 -; GFX11-NEXT: s_add_i32 s7, s0, 0x90 ; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:240 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:224 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:208 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: scratch_store_b128 off, v[13:16], s4 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:192 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s5 +; 
GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:176 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: scratch_store_b128 off, v[21:24], s6 +; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:160 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7 +; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:144 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:128 +; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:128 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b32 off, v33, s0 +; GFX11-NEXT: scratch_store_b32 v0, v33, off ; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load { i32, <32 x i32> }, ptr addrspace(1) %ptr diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index c1d682689903..3b078c41f4a8 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -1989,256 +1989,138 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 { ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_clause 0x7 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:1024 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:512 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:256 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:128 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:64 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:32 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:16 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 -; GFX11-NEXT: s_add_i32 s1, s0, 0x7f0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x7e0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: 
scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x7d0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x7c0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x7b0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x7a0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x790 -; GFX11-NEXT: s_add_i32 s2, s0, 0x780 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x770 -; GFX11-NEXT: s_add_i32 s2, s0, 0x760 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x750 -; GFX11-NEXT: s_add_i32 s2, s0, 0x740 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x730 -; GFX11-NEXT: s_add_i32 s2, s0, 0x720 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x710 -; GFX11-NEXT: s_add_i32 s2, s0, 0x700 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x6f0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x6e0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x6d0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x6c0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x6b0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x6a0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x690 -; GFX11-NEXT: s_add_i32 s2, s0, 0x680 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; 
GFX11-NEXT: s_add_i32 s1, s0, 0x670 -; GFX11-NEXT: s_add_i32 s2, s0, 0x660 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x650 -; GFX11-NEXT: s_add_i32 s2, s0, 0x640 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x630 -; GFX11-NEXT: s_add_i32 s2, s0, 0x620 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x610 -; GFX11-NEXT: s_add_i32 s2, s0, 0x600 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x5f0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x5e0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x5d0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x5c0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x5b0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x5a0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x590 -; GFX11-NEXT: s_add_i32 s2, s0, 0x580 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x570 -; GFX11-NEXT: s_add_i32 s2, s0, 0x560 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x550 -; GFX11-NEXT: s_add_i32 s2, s0, 0x540 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x530 -; GFX11-NEXT: s_add_i32 s2, s0, 0x520 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x510 -; 
GFX11-NEXT: s_add_i32 s2, s0, 0x500 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x4f0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x4e0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x4d0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x4c0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x4b0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x4a0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x490 -; GFX11-NEXT: s_add_i32 s2, s0, 0x480 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x470 -; GFX11-NEXT: s_add_i32 s2, s0, 0x460 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x450 -; GFX11-NEXT: s_add_i32 s2, s0, 0x440 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x430 -; GFX11-NEXT: s_add_i32 s2, s0, 0x420 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x410 -; GFX11-NEXT: s_add_i32 s2, s0, 0x3f0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x3e0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x3d0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x3c0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x3b0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x3a0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x390 -; 
GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x380 -; GFX11-NEXT: s_add_i32 s2, s0, 0x370 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x360 -; GFX11-NEXT: s_add_i32 s2, s0, 0x350 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x340 -; GFX11-NEXT: s_add_i32 s2, s0, 0x330 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x320 -; GFX11-NEXT: s_add_i32 s2, s0, 0x310 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x300 -; GFX11-NEXT: s_add_i32 s2, s0, 0x2f0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x2e0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x2d0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x2c0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x2b0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x2a0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x290 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x280 -; GFX11-NEXT: s_add_i32 s2, s0, 0x270 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x260 -; GFX11-NEXT: s_add_i32 s2, s0, 0x250 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x240 -; GFX11-NEXT: s_add_i32 s2, s0, 0x230 -; GFX11-NEXT: scratch_store_b128 off, 
v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x220 -; GFX11-NEXT: s_add_i32 s2, s0, 0x210 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x1f0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x1e0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x1d0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x1c0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x1b0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x1a0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x190 -; GFX11-NEXT: s_add_i32 s2, s0, 0x180 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x170 -; GFX11-NEXT: s_add_i32 s2, s0, 0x160 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x150 -; GFX11-NEXT: s_add_i32 s2, s0, 0x140 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x130 -; GFX11-NEXT: s_add_i32 s2, s0, 0x120 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x110 -; GFX11-NEXT: s_add_i32 s2, s0, 0xf0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0xe0 -; GFX11-NEXT: s_add_i32 s2, s0, 0xd0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0xc0 -; GFX11-NEXT: s_add_i32 s2, s0, 0xb0 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 
off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0xa0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x90 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x70 -; GFX11-NEXT: s_add_i32 s2, s0, 0x60 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2 -; GFX11-NEXT: s_add_i32 s1, s0, 0x50 -; GFX11-NEXT: s_add_i32 s0, s0, 48 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2032 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2016 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2000 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1984 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1968 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1952 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1936 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1920 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1904 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1888 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1872 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1856 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1840 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1824 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1808 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1792 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1776 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1760 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1744 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1728 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1712 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1696 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1680 +; 
GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1664 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1648 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1632 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1616 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1600 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1584 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1568 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1552 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1536 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1520 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1504 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1488 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1472 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1456 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1440 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1424 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1408 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1392 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1376 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1360 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1344 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1328 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1312 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1296 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1280 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1264 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1248 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1232 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1216 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1200 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1184 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off 
offset:1168 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1152 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1136 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1120 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1104 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1088 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1072 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1056 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1040 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1024 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1008 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:992 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:976 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:960 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:944 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:928 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:912 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:896 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:880 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:864 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:848 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:832 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:816 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:800 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:784 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:768 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:752 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:736 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:720 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:704 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:688 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:672 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off 
offset:656 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:640 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:624 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:608 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:592 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:576 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:560 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:544 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:528 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:512 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:496 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:480 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:464 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:448 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:432 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:416 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:400 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:384 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:368 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:352 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:336 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:320 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:304 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:288 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:272 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:256 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:240 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:224 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:208 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:192 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:176 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:160 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:144 +; 
GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:128 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: ret <512 x i32> zeroinitializer @@ -2636,7 +2518,6 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-LABEL: return_72xi32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: s_clause 0xc ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:212 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208 @@ -2651,93 +2532,82 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:168 ; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:164 -; GFX11-NEXT: s_clause 0x14 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:96 -; 
GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:124 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:120 -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s0 offset:64 +; GFX11-NEXT: s_clause 0x11 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88 +; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v19, off, 
s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:136 -; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32 +; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v11, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v10, off, s32 offset:152 -; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16 +; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-NEXT: s_clause 0xd -; GFX11-NEXT: scratch_load_b32 v8, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v7, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v6, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v5, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v9, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v16, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v20, off, s32 
offset:100 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v32, off, s32 -; GFX11-NEXT: s_add_i32 s1, s0, 0x110 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 -; GFX11-NEXT: s_add_i32 s2, s0, 0x100 -; GFX11-NEXT: s_add_i32 s3, s0, 0xf0 -; GFX11-NEXT: s_add_i32 s34, s0, 0xe0 -; GFX11-NEXT: s_add_i32 s35, s0, 0xd0 -; GFX11-NEXT: s_add_i32 s36, s0, 0xc0 -; GFX11-NEXT: s_add_i32 s37, s0, 0xb0 -; GFX11-NEXT: s_add_i32 s38, s0, 0xa0 -; GFX11-NEXT: s_add_i32 s39, s0, 0x90 -; GFX11-NEXT: s_add_i32 s40, s0, 0x70 -; GFX11-NEXT: s_add_i32 s41, s0, 0x60 -; GFX11-NEXT: s_add_i32 s42, s0, 0x50 -; GFX11-NEXT: s_add_i32 s43, s0, 48 ; GFX11-NEXT: s_waitcnt vmcnt(10) -; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:128 +; GFX11-NEXT: scratch_store_b128 v0, v[60:63], off offset:272 ; GFX11-NEXT: s_waitcnt vmcnt(9) -; GFX11-NEXT: scratch_store_b128 off, v[9:12], s1 +; GFX11-NEXT: scratch_store_b128 v0, v[12:15], off offset:256 ; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s2 +; GFX11-NEXT: scratch_store_b128 v0, v[16:19], off offset:240 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: scratch_store_b128 off, v[60:63], s3 +; GFX11-NEXT: scratch_store_b128 v0, v[20:23], off offset:224 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: scratch_store_b128 off, v[56:59], s34 +; GFX11-NEXT: scratch_store_b128 v0, v[56:59], off offset:208 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: scratch_store_b128 off, v[41:44], s35 +; GFX11-NEXT: scratch_store_b128 v0, v[41:44], off offset:192 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: scratch_store_b128 off, v[37:40], s36 +; GFX11-NEXT: scratch_store_b128 v0, v[37:40], off offset:176 ; 
GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: scratch_store_b128 off, v[52:55], s37 +; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: scratch_store_b128 off, v[48:51], s38 +; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: scratch_store_b128 off, v[33:36], s39 +; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s40 -; GFX11-NEXT: scratch_store_b128 off, v[25:28], s41 -; GFX11-NEXT: scratch_store_b128 off, v[21:24], s42 -; GFX11-NEXT: scratch_store_b128 off, v[13:16], s43 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 +; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-NEXT: s_clause 0xc ; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164 ; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168 @@ -3306,7 +3176,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-LABEL: call_72xi32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s46, s33 +; GFX11-NEXT: s_mov_b32 s34, s33 ; GFX11-NEXT: s_add_i32 s33, s32, 0x1ff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00 @@ -3353,11 +3223,11 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 ; GFX11-NEXT: s_add_i32 s0, s32, 32 ; GFX11-NEXT: s_add_i32 s1, s32, 16 +; GFX11-NEXT: s_add_i32 s2, s33, 0x200 +; GFX11-NEXT: v_writelane_b32 v60, s30, 0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 -; GFX11-NEXT: s_add_i32 s0, s33, 0x200 -; GFX11-NEXT: v_writelane_b32 v60, s30, 0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0 ; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, 0 @@ -3373,14 +3243,14 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0 ; GFX11-NEXT: v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0 ; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0 -; GFX11-NEXT: s_mov_b32 s45, return_72xi32@abs32@hi -; GFX11-NEXT: s_mov_b32 s44, return_72xi32@abs32@lo +; GFX11-NEXT: s_mov_b32 s1, return_72xi32@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, return_72xi32@abs32@lo ; GFX11-NEXT: v_writelane_b32 v60, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_load_b128 v[45:48], off, s33 offset:624 ; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:640 -; GFX11-NEXT: s_add_i32 s0, s32, 0xa0 +; GFX11-NEXT: s_add_i32 s2, s32, 0xa0 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_mov_b32_e32 v32, v48 ; GFX11-NEXT: s_clause 0x9 @@ -3431,38 +3301,38 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v6 ; GFX11-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v6, v9 ; GFX11-NEXT: v_mov_b32_e32 v9, v20 -; GFX11-NEXT: scratch_store_b32 off, v11, s0 -; GFX11-NEXT: s_add_i32 s0, s32, 0x90 +; GFX11-NEXT: scratch_store_b32 off, v11, s2 +; GFX11-NEXT: s_add_i32 s2, s32, 0x90 ; GFX11-NEXT: v_mov_b32_e32 v11, v22 -; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0 -; GFX11-NEXT: s_add_i32 s0, s32, 0x80 +; GFX11-NEXT: scratch_store_b128 off, v[4:7], s2 +; GFX11-NEXT: s_add_i32 s2, s32, 0x80 ; GFX11-NEXT: v_mov_b32_e32 v5, v16 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-NEXT: v_mov_b32_e32 v0, 24 -; GFX11-NEXT: s_add_i32 s0, s32, 0x70 +; GFX11-NEXT: s_add_i32 s2, s32, 
0x70 ; GFX11-NEXT: v_mov_b32_e32 v6, v17 -; GFX11-NEXT: scratch_store_b128 off, v[12:15], s0 +; GFX11-NEXT: scratch_store_b128 off, v[12:15], s2 ; GFX11-NEXT: v_mov_b32_e32 v13, v24 -; GFX11-NEXT: s_add_i32 s0, s32, 0x6c +; GFX11-NEXT: s_add_i32 s2, s32, 0x6c ; GFX11-NEXT: v_mov_b32_e32 v7, v18 -; GFX11-NEXT: scratch_store_b32 off, v0, s0 -; GFX11-NEXT: s_add_i32 s0, s32, 0x60 +; GFX11-NEXT: scratch_store_b32 off, v0, s2 +; GFX11-NEXT: s_add_i32 s2, s32, 0x60 ; GFX11-NEXT: v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v15, v26 -; GFX11-NEXT: scratch_store_b96 off, v[56:58], s0 -; GFX11-NEXT: s_add_i32 s0, s32, 0x50 +; GFX11-NEXT: scratch_store_b96 off, v[56:58], s2 +; GFX11-NEXT: s_add_i32 s2, s32, 0x50 ; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45 -; GFX11-NEXT: scratch_store_b128 off, v[40:43], s0 -; GFX11-NEXT: s_add_i32 s0, s32, 64 +; GFX11-NEXT: scratch_store_b128 off, v[40:43], s2 +; GFX11-NEXT: s_add_i32 s2, s32, 64 ; GFX11-NEXT: v_mov_b32_e32 v14, v25 -; GFX11-NEXT: scratch_store_b128 off, v[52:55], s0 -; GFX11-NEXT: s_add_i32 s0, s32, 48 +; GFX11-NEXT: scratch_store_b128 off, v[52:55], s2 +; GFX11-NEXT: s_add_i32 s2, s32, 48 ; GFX11-NEXT: v_mov_b32_e32 v16, v27 -; GFX11-NEXT: scratch_store_b128 off, v[36:39], s0 -; GFX11-NEXT: s_add_i32 s0, s32, 32 +; GFX11-NEXT: scratch_store_b128 off, v[36:39], s2 +; GFX11-NEXT: s_add_i32 s2, s32, 32 ; GFX11-NEXT: v_mov_b32_e32 v30, v46 -; GFX11-NEXT: scratch_store_b128 off, v[48:51], s0 -; GFX11-NEXT: s_add_i32 s0, s32, 16 -; GFX11-NEXT: scratch_store_b128 off, v[32:35], s0 +; GFX11-NEXT: scratch_store_b128 off, v[48:51], s2 +; GFX11-NEXT: s_add_i32 s2, s32, 16 +; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2 ; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, 42 @@ -3470,10 +3340,10 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1572 ; GFX11-NEXT: 
scratch_load_b128 v[21:24], off, s33 offset:1556 ; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1540 -; GFX11-NEXT: s_add_i32 s0, s33, 0x400 +; GFX11-NEXT: s_add_i32 s2, s33, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45] +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_clause 0xb ; GFX11-NEXT: scratch_load_b32 v59, off, s33 ; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4 @@ -3493,7 +3363,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:1536 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xf600 -; GFX11-NEXT: s_mov_b32 s33, s46 +; GFX11-NEXT: s_mov_b32 s33, s34 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/WebAssembly/multi-return.ll b/llvm/test/CodeGen/WebAssembly/multi-return.ll index 3429cd512a46..293a1b35c39c 100644 --- a/llvm/test/CodeGen/WebAssembly/multi-return.ll +++ b/llvm/test/CodeGen/WebAssembly/multi-return.ll @@ -78,18 +78,16 @@ define i64 @test4() { define { i64, i128 } @test5() { ; CHECK-LABEL: test5: ; CHECK: call return_multi_multi -; CHECK: i32.const $push8=, 8 -; CHECK: i32.add $push9=, $[[SP:[0-9]+]], $pop8 -; CHECK: i32.const $push0=, 16 -; CHECK: i32.add $push1=, $pop9, $pop0 +; CHECK: i32.const $push0=, 24 +; CHECK: i32.add $push1=, $[[SP:[0-9]+]], $pop0 ; CHECK: i64.load $[[L1:[0-9]+]]=, 0($pop1) ; CHECK: i64.load $[[L2:[0-9]+]]=, 8($[[SP]]) ; CHECK: i64.load $push2=, 16($[[SP]]) ; CHECK: i64.store 8($0), $pop2 +; CHECK: i64.store 16($0), $[[L1]] ; CHECK: i64.store 0($0), $[[L2]] -; CHECK: i32.const $push12=, 16 -; CHECK: i32.add $push3=, $0, $pop12 -; CHECK: i64.store 0($pop3), $[[L1]] +; CHECK: i32.const $push5=, 80 +; CHECK: i32.add $push6=, $3, $pop5 %t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi() %r0 = extractvalue { 
i64, i128, i192, i128, i64 } %t0, 0 %r1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 1 @@ -101,20 +99,20 @@ define { i64, i128 } @test5() { define { i128, i128 } @test6() { ; CHECK-LABEL: test6: ; CHECK: call return_multi_multi -; CHECK: i32.const $push0=, 64 +; CHECK: i32.const $push0=, 24 ; CHECK: i32.add $push1=, $[[SP:[0-9]+]], $pop0 ; CHECK: i64.load $[[L1:[0-9]+]]=, 0($pop1) -; CHECK: i32.const $push2=, 24 +; CHECK: i32.const $push2=, 64 ; CHECK: i32.add $push3=, $[[SP]], $pop2 ; CHECK: i64.load $[[L2:[0-9]+]]=, 0($pop3) ; CHECK: i64.load $[[L3:[0-9]+]]=, 16($[[SP]]) ; CHECK: i64.load $push4=, 56($[[SP]]) ; CHECK: i64.store 16($0), $pop4 +; CHECK: i64.store 24($0), $[[L2]] ; CHECK: i64.store 0($0), $[[L3]] -; CHECK: i64.store 8($0), $[[L2]] -; CHECK: i32.const $push5=, 24 -; CHECK: i32.add $push6=, $0, $pop5 -; CHECK: i64.store 0($pop6), $[[L1]] +; CHECK: i64.store 8($0), $[[L1]] +; CHECK: i32.const $push7=, 80 +; CHECK: i32.add $push8=, $4, $pop7 %t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi() %r1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 1 %r3 = extractvalue { i64, i128, i192, i128, i64 } %t0, 3 @@ -129,19 +127,17 @@ define { i64, i192 } @test7() { ; CHECK: i32.const $push0=, 40 ; CHECK: i32.add $push1=, $[[SP:[0-9]+]], $pop0 ; CHECK: i64.load $[[L1:[0-9]+]]=, 0($pop1) +; CHECK: i64.load $[[L2:[0-9]+]]=, 8($[[SP]]) +; CHECK: i64.load $[[L3:[0-9]+]]=, 32($[[SP]]) ; CHECK: i32.const $push2=, 48 ; CHECK: i32.add $push3=, $[[SP]], $pop2 -; CHECK: i64.load $[[L2:[0-9]+]]=, 0($pop3) -; CHECK: i64.load $[[L3:[0-9]+]]=, 8($[[SP]]) -; CHECK: i64.load $push4=, 32($[[SP]]) -; CHECK: i64.store 8($0), $pop4 -; CHECK: i64.store 0($0), $[[L3]] -; CHECK: i32.const $push5=, 24 -; CHECK: i32.add $push6=, $0, $pop5 -; CHECK: i64.store 0($pop6), $[[L2]] -; CHECK: i32.const $push7=, 16 -; CHECK: i32.add $push8=, $0, $pop7 -; CHECK: i64.store 0($pop8), $[[L1]] +; CHECK: i64.load $push4=, 0($pop3) +; CHECK: i64.store 24($0), $pop4 +; CHECK: 
i64.store 8($0), $[[L3]] +; CHECK: i64.store 16($0), $[[L1]] +; CHECK: i64.store 0($0), $[[L2]] +; CHECK: i32.const $push7=, 80 +; CHECK: i32.add $push8=, $4, $pop7 %t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi() %r0 = extractvalue { i64, i128, i192, i128, i64 } %t0, 0 %r2 = extractvalue { i64, i128, i192, i128, i64 } %t0, 2 @@ -153,18 +149,16 @@ define { i64, i192 } @test7() { define { i128, i192, i128, i64 } @test8() { ; CHECK-LABEL: test8: ; CHECK: call return_multi_multi -; CHECK: i32.const $push18=, 8 -; CHECK: i32.add $push19=, $[[SP:[0-9]+]], $pop18 -; CHECK: i32.const $push0=, 32 -; CHECK: i32.add $push1=, $pop19, $pop0 +; CHECK: i32.const $push0=, 64 +; CHECK: i32.add $push1=, $[[SP:[0-9]+]], $pop0 ; CHECK: i64.load $[[L1:[0-9]+]]=, 0($pop1) -; CHECK: i32.const $push2=, 48 +; CHECK: i32.const $push2=, 40 ; CHECK: i32.add $push3=, $[[SP]], $pop2 ; CHECK: i64.load $[[L2:[0-9]+]]=, 0($pop3) -; CHECK: i32.const $push4=, 24 +; CHECK: i32.const $push4=, 48 ; CHECK: i32.add $push5=, $[[SP]], $pop4 ; CHECK: i64.load $[[L3:[0-9]+]]=, 0($pop5) -; CHECK: i32.const $push6=, 64 +; CHECK: i32.const $push6=, 24 ; CHECK: i32.add $push7=, $[[SP]], $pop6 ; CHECK: i64.load $[[L4:[0-9]+]]=, 0($pop7) ; CHECK: i64.load $[[L5:[0-9]+]]=, 8($[[SP]]) @@ -172,19 +166,15 @@ define { i128, i192, i128, i64 } @test8() { ; CHECK: i64.load $[[L7:[0-9]+]]=, 32($[[SP]]) ; CHECK: i64.load $push8=, 16($[[SP]]) ; CHECK: i64.store 40($0), $pop8 +; CHECK: i64.store 48($0), $[[L4]] +; CHECK: i64.store 32($0), $[[L3]] ; CHECK: i64.store 16($0), $[[L7]] +; CHECK: i64.store 24($0), $[[L2]] ; CHECK: i64.store 0($0), $[[L6]] -; CHECK: i64.store 8($0), $[[L4]] +; CHECK: i64.store 8($0), $[[L1]] ; CHECK: i64.store 56($0), $[[L5]] -; CHECK: i32.const $push9=, 48 -; CHECK: i32.add $push10=, $0, $pop9 -; CHECK: i64.store 0($pop10), $[[L3]] -; CHECK: i32.const $push22=, 32 -; CHECK: i32.add $push11=, $0, $pop22 -; CHECK: i64.store 0($pop11), $[[L2]] -; CHECK: i32.const $push12=, 24 -; CHECK: 
i32.add $push13=, $0, $pop12 -; CHECK: i64.store 0($pop13), $[[L1]] +; CHECK: i32.const $push11=, 80 +; CHECK: i32.add $push12=, $8, $pop11 %t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi() %r0 = extractvalue { i64, i128, i192, i128, i64 } %t0, 0 %r1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 1 diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll index 3a806b955f85..761a75418a00 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll @@ -31,60 +31,38 @@ define <16 x i8> @add_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-LABEL: add_v16i8: ; NO-SIMD128: .functype add_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.add $push0=, $9, $25 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop0 -; NO-SIMD128-NEXT: i32.add $push1=, $5, $21 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop1 -; NO-SIMD128-NEXT: i32.add $push2=, $3, $19 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop2 -; NO-SIMD128-NEXT: i32.add $push3=, $2, $18 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop3 -; NO-SIMD128-NEXT: i32.add $push4=, $1, $17 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push6=, 15 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.add $push5=, $16, $32 -; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5 -; NO-SIMD128-NEXT: i32.const $push9=, 14 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-NEXT: i32.add $push8=, $15, $31 -; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8 -; NO-SIMD128-NEXT: i32.const $push12=, 13 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.add $push11=, $14, $30 -; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11 -; NO-SIMD128-NEXT: i32.const $push15=, 12 -; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15 -; 
NO-SIMD128-NEXT: i32.add $push14=, $13, $29 -; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14 -; NO-SIMD128-NEXT: i32.const $push18=, 11 -; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-NEXT: i32.add $push17=, $12, $28 -; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17 -; NO-SIMD128-NEXT: i32.const $push21=, 10 -; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-NEXT: i32.add $push20=, $11, $27 -; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20 -; NO-SIMD128-NEXT: i32.const $push24=, 9 -; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-NEXT: i32.add $push23=, $10, $26 -; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23 -; NO-SIMD128-NEXT: i32.const $push27=, 7 -; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-NEXT: i32.add $push26=, $8, $24 -; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26 -; NO-SIMD128-NEXT: i32.const $push30=, 6 -; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30 -; NO-SIMD128-NEXT: i32.add $push29=, $7, $23 -; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29 -; NO-SIMD128-NEXT: i32.const $push33=, 5 -; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-NEXT: i32.add $push32=, $6, $22 -; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32 -; NO-SIMD128-NEXT: i32.const $push36=, 3 -; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36 -; NO-SIMD128-NEXT: i32.add $push35=, $4, $20 -; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35 +; NO-SIMD128-NEXT: i32.add $push0=, $16, $32 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop0 +; NO-SIMD128-NEXT: i32.add $push1=, $15, $31 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop1 +; NO-SIMD128-NEXT: i32.add $push2=, $14, $30 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop2 +; NO-SIMD128-NEXT: i32.add $push3=, $13, $29 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop3 +; NO-SIMD128-NEXT: i32.add $push4=, $12, $28 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop4 +; NO-SIMD128-NEXT: i32.add $push5=, $11, $27 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop5 +; NO-SIMD128-NEXT: i32.add $push6=, $10, $26 +; 
NO-SIMD128-NEXT: i32.store8 9($0), $pop6 +; NO-SIMD128-NEXT: i32.add $push7=, $9, $25 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop7 +; NO-SIMD128-NEXT: i32.add $push8=, $8, $24 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop8 +; NO-SIMD128-NEXT: i32.add $push9=, $7, $23 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop9 +; NO-SIMD128-NEXT: i32.add $push10=, $6, $22 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop10 +; NO-SIMD128-NEXT: i32.add $push11=, $5, $21 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop11 +; NO-SIMD128-NEXT: i32.add $push12=, $4, $20 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop12 +; NO-SIMD128-NEXT: i32.add $push13=, $3, $19 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop13 +; NO-SIMD128-NEXT: i32.add $push14=, $2, $18 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop14 +; NO-SIMD128-NEXT: i32.add $push15=, $1, $17 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop15 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: add_v16i8: @@ -96,54 +74,32 @@ define <16 x i8> @add_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.add $push2=, $3, $19 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.add $push5=, $4, $20 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5 -; NO-SIMD128-FAST-NEXT: i32.add $push6=, $5, $21 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.add $push9=, $6, $22 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-FAST-NEXT: i32.add $push12=, $7, $23 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.add $push15=, $8, $24 
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15 -; NO-SIMD128-FAST-NEXT: i32.add $push16=, $9, $25 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.add $push19=, $10, $26 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19 -; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20 -; NO-SIMD128-FAST-NEXT: i32.add $push22=, $11, $27 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-FAST-NEXT: i32.add $push25=, $12, $28 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $13, $29 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28 -; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-FAST-NEXT: i32.add $push31=, $14, $30 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31 -; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32 -; NO-SIMD128-FAST-NEXT: i32.add $push34=, $15, $31 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34 -; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35 -; NO-SIMD128-FAST-NEXT: i32.add $push37=, $16, $32 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37 +; NO-SIMD128-FAST-NEXT: i32.add $push3=, $4, $20 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3 +; NO-SIMD128-FAST-NEXT: i32.add $push4=, $5, $21 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.add $push5=, $6, $22 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.add $push6=, $7, $23 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), 
$pop6 +; NO-SIMD128-FAST-NEXT: i32.add $push7=, $8, $24 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7 +; NO-SIMD128-FAST-NEXT: i32.add $push8=, $9, $25 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.add $push9=, $10, $26 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9 +; NO-SIMD128-FAST-NEXT: i32.add $push10=, $11, $27 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.add $push11=, $12, $28 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.add $push12=, $13, $29 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.add $push13=, $14, $30 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13 +; NO-SIMD128-FAST-NEXT: i32.add $push14=, $15, $31 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.add $push15=, $16, $32 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15 ; NO-SIMD128-FAST-NEXT: return %a = add <16 x i8> %x, %y ret <16 x i8> %a @@ -165,60 +121,38 @@ define <16 x i8> @sub_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-LABEL: sub_v16i8: ; NO-SIMD128: .functype sub_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.sub $push0=, $9, $25 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop0 -; NO-SIMD128-NEXT: i32.sub $push1=, $5, $21 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop1 -; NO-SIMD128-NEXT: i32.sub $push2=, $3, $19 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop2 -; NO-SIMD128-NEXT: i32.sub $push3=, $2, $18 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop3 -; NO-SIMD128-NEXT: i32.sub $push4=, $1, $17 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push6=, 15 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.sub $push5=, $16, $32 -; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5 -; NO-SIMD128-NEXT: i32.const 
$push9=, 14 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-NEXT: i32.sub $push8=, $15, $31 -; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8 -; NO-SIMD128-NEXT: i32.const $push12=, 13 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.sub $push11=, $14, $30 -; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11 -; NO-SIMD128-NEXT: i32.const $push15=, 12 -; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15 -; NO-SIMD128-NEXT: i32.sub $push14=, $13, $29 -; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14 -; NO-SIMD128-NEXT: i32.const $push18=, 11 -; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-NEXT: i32.sub $push17=, $12, $28 -; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17 -; NO-SIMD128-NEXT: i32.const $push21=, 10 -; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-NEXT: i32.sub $push20=, $11, $27 -; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20 -; NO-SIMD128-NEXT: i32.const $push24=, 9 -; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-NEXT: i32.sub $push23=, $10, $26 -; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23 -; NO-SIMD128-NEXT: i32.const $push27=, 7 -; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-NEXT: i32.sub $push26=, $8, $24 -; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26 -; NO-SIMD128-NEXT: i32.const $push30=, 6 -; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30 -; NO-SIMD128-NEXT: i32.sub $push29=, $7, $23 -; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29 -; NO-SIMD128-NEXT: i32.const $push33=, 5 -; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-NEXT: i32.sub $push32=, $6, $22 -; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32 -; NO-SIMD128-NEXT: i32.const $push36=, 3 -; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36 -; NO-SIMD128-NEXT: i32.sub $push35=, $4, $20 -; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35 +; NO-SIMD128-NEXT: i32.sub $push0=, $16, $32 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop0 +; NO-SIMD128-NEXT: i32.sub $push1=, $15, $31 +; NO-SIMD128-NEXT: 
i32.store8 14($0), $pop1 +; NO-SIMD128-NEXT: i32.sub $push2=, $14, $30 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop2 +; NO-SIMD128-NEXT: i32.sub $push3=, $13, $29 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop3 +; NO-SIMD128-NEXT: i32.sub $push4=, $12, $28 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop4 +; NO-SIMD128-NEXT: i32.sub $push5=, $11, $27 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop5 +; NO-SIMD128-NEXT: i32.sub $push6=, $10, $26 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop6 +; NO-SIMD128-NEXT: i32.sub $push7=, $9, $25 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop7 +; NO-SIMD128-NEXT: i32.sub $push8=, $8, $24 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop8 +; NO-SIMD128-NEXT: i32.sub $push9=, $7, $23 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop9 +; NO-SIMD128-NEXT: i32.sub $push10=, $6, $22 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop10 +; NO-SIMD128-NEXT: i32.sub $push11=, $5, $21 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop11 +; NO-SIMD128-NEXT: i32.sub $push12=, $4, $20 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop12 +; NO-SIMD128-NEXT: i32.sub $push13=, $3, $19 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop13 +; NO-SIMD128-NEXT: i32.sub $push14=, $2, $18 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop14 +; NO-SIMD128-NEXT: i32.sub $push15=, $1, $17 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop15 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: sub_v16i8: @@ -230,54 +164,32 @@ define <16 x i8> @sub_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $3, $19 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $4, $20 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5 -; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $5, $21 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; 
NO-SIMD128-FAST-NEXT: i32.sub $push9=, $6, $22 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $7, $23 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $8, $24 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15 -; NO-SIMD128-FAST-NEXT: i32.sub $push16=, $9, $25 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.sub $push19=, $10, $26 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19 -; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20 -; NO-SIMD128-FAST-NEXT: i32.sub $push22=, $11, $27 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-FAST-NEXT: i32.sub $push25=, $12, $28 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-FAST-NEXT: i32.sub $push28=, $13, $29 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28 -; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-FAST-NEXT: i32.sub $push31=, $14, $30 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31 -; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32 -; NO-SIMD128-FAST-NEXT: i32.sub $push34=, $15, $31 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34 -; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35 -; NO-SIMD128-FAST-NEXT: i32.sub 
$push37=, $16, $32 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37 +; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $4, $20 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3 +; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $5, $21 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $6, $22 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $7, $23 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $8, $24 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7 +; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $9, $25 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $10, $26 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9 +; NO-SIMD128-FAST-NEXT: i32.sub $push10=, $11, $27 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.sub $push11=, $12, $28 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $13, $29 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.sub $push13=, $14, $30 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13 +; NO-SIMD128-FAST-NEXT: i32.sub $push14=, $15, $31 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $16, $32 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15 ; NO-SIMD128-FAST-NEXT: return %a = sub <16 x i8> %x, %y ret <16 x i8> %a @@ -425,60 +337,38 @@ define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-LABEL: mul_v16i8: ; NO-SIMD128: .functype mul_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.mul $push0=, $9, $25 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop0 -; NO-SIMD128-NEXT: i32.mul $push1=, $5, $21 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop1 -; 
NO-SIMD128-NEXT: i32.mul $push2=, $3, $19 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop2 -; NO-SIMD128-NEXT: i32.mul $push3=, $2, $18 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop3 -; NO-SIMD128-NEXT: i32.mul $push4=, $1, $17 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push6=, 15 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.mul $push5=, $16, $32 -; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5 -; NO-SIMD128-NEXT: i32.const $push9=, 14 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-NEXT: i32.mul $push8=, $15, $31 -; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8 -; NO-SIMD128-NEXT: i32.const $push12=, 13 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.mul $push11=, $14, $30 -; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11 -; NO-SIMD128-NEXT: i32.const $push15=, 12 -; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15 -; NO-SIMD128-NEXT: i32.mul $push14=, $13, $29 -; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14 -; NO-SIMD128-NEXT: i32.const $push18=, 11 -; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-NEXT: i32.mul $push17=, $12, $28 -; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17 -; NO-SIMD128-NEXT: i32.const $push21=, 10 -; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-NEXT: i32.mul $push20=, $11, $27 -; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20 -; NO-SIMD128-NEXT: i32.const $push24=, 9 -; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-NEXT: i32.mul $push23=, $10, $26 -; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23 -; NO-SIMD128-NEXT: i32.const $push27=, 7 -; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-NEXT: i32.mul $push26=, $8, $24 -; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26 -; NO-SIMD128-NEXT: i32.const $push30=, 6 -; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30 -; NO-SIMD128-NEXT: i32.mul $push29=, $7, $23 -; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29 -; NO-SIMD128-NEXT: i32.const $push33=, 5 -; 
NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-NEXT: i32.mul $push32=, $6, $22 -; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32 -; NO-SIMD128-NEXT: i32.const $push36=, 3 -; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36 -; NO-SIMD128-NEXT: i32.mul $push35=, $4, $20 -; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35 +; NO-SIMD128-NEXT: i32.mul $push0=, $16, $32 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop0 +; NO-SIMD128-NEXT: i32.mul $push1=, $15, $31 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop1 +; NO-SIMD128-NEXT: i32.mul $push2=, $14, $30 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop2 +; NO-SIMD128-NEXT: i32.mul $push3=, $13, $29 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop3 +; NO-SIMD128-NEXT: i32.mul $push4=, $12, $28 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop4 +; NO-SIMD128-NEXT: i32.mul $push5=, $11, $27 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop5 +; NO-SIMD128-NEXT: i32.mul $push6=, $10, $26 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop6 +; NO-SIMD128-NEXT: i32.mul $push7=, $9, $25 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop7 +; NO-SIMD128-NEXT: i32.mul $push8=, $8, $24 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop8 +; NO-SIMD128-NEXT: i32.mul $push9=, $7, $23 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop9 +; NO-SIMD128-NEXT: i32.mul $push10=, $6, $22 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop10 +; NO-SIMD128-NEXT: i32.mul $push11=, $5, $21 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop11 +; NO-SIMD128-NEXT: i32.mul $push12=, $4, $20 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop12 +; NO-SIMD128-NEXT: i32.mul $push13=, $3, $19 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop13 +; NO-SIMD128-NEXT: i32.mul $push14=, $2, $18 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop14 +; NO-SIMD128-NEXT: i32.mul $push15=, $1, $17 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop15 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: mul_v16i8: @@ -490,54 +380,32 @@ define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.mul 
$push2=, $3, $19 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $4, $20 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5 -; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $5, $21 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $6, $22 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $7, $23 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $8, $24 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15 -; NO-SIMD128-FAST-NEXT: i32.mul $push16=, $9, $25 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.mul $push19=, $10, $26 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19 -; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20 -; NO-SIMD128-FAST-NEXT: i32.mul $push22=, $11, $27 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-FAST-NEXT: i32.mul $push25=, $12, $28 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-FAST-NEXT: i32.mul $push28=, $13, $29 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28 -; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push30=, 
$0, $pop29 -; NO-SIMD128-FAST-NEXT: i32.mul $push31=, $14, $30 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31 -; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32 -; NO-SIMD128-FAST-NEXT: i32.mul $push34=, $15, $31 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34 -; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35 -; NO-SIMD128-FAST-NEXT: i32.mul $push37=, $16, $32 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37 +; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $4, $20 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3 +; NO-SIMD128-FAST-NEXT: i32.mul $push4=, $5, $21 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $6, $22 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $7, $23 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.mul $push7=, $8, $24 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7 +; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $9, $25 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $10, $26 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9 +; NO-SIMD128-FAST-NEXT: i32.mul $push10=, $11, $27 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $12, $28 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $13, $29 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $14, $30 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13 +; NO-SIMD128-FAST-NEXT: i32.mul $push14=, $15, $31 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $16, $32 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15 ; NO-SIMD128-FAST-NEXT: return %a = mul <16 x i8> %x, %y ret <16 x i8> %a @@ -559,108 +427,86 @@ define <16 x i8> @min_s_v16i8(<16 x i8> %x, <16 x i8> %y) 
{ ; NO-SIMD128-LABEL: min_s_v16i8: ; NO-SIMD128: .functype min_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push4=, 15 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 ; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16 ; NO-SIMD128-NEXT: i32.extend8_s $push0=, $32 ; NO-SIMD128-NEXT: i32.lt_s $push2=, $pop1, $pop0 ; NO-SIMD128-NEXT: i32.select $push3=, $16, $32, $pop2 -; NO-SIMD128-NEXT: i32.store8 0($pop5), $pop3 -; NO-SIMD128-NEXT: i32.const $push10=, 14 -; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-NEXT: i32.extend8_s $push7=, $15 -; NO-SIMD128-NEXT: i32.extend8_s $push6=, $31 -; NO-SIMD128-NEXT: i32.lt_s $push8=, $pop7, $pop6 -; NO-SIMD128-NEXT: i32.select $push9=, $15, $31, $pop8 -; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9 -; NO-SIMD128-NEXT: i32.const $push16=, 13 -; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.extend8_s $push13=, $14 -; NO-SIMD128-NEXT: i32.extend8_s $push12=, $30 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop3 +; NO-SIMD128-NEXT: i32.extend8_s $push5=, $15 +; NO-SIMD128-NEXT: i32.extend8_s $push4=, $31 +; NO-SIMD128-NEXT: i32.lt_s $push6=, $pop5, $pop4 +; NO-SIMD128-NEXT: i32.select $push7=, $15, $31, $pop6 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop7 +; NO-SIMD128-NEXT: i32.extend8_s $push9=, $14 +; NO-SIMD128-NEXT: i32.extend8_s $push8=, $30 +; NO-SIMD128-NEXT: i32.lt_s $push10=, $pop9, $pop8 +; NO-SIMD128-NEXT: i32.select $push11=, $14, $30, $pop10 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop11 +; NO-SIMD128-NEXT: i32.extend8_s $push13=, $13 +; NO-SIMD128-NEXT: i32.extend8_s $push12=, $29 ; NO-SIMD128-NEXT: i32.lt_s $push14=, $pop13, $pop12 -; NO-SIMD128-NEXT: i32.select $push15=, $14, $30, $pop14 -; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15 -; NO-SIMD128-NEXT: i32.const $push22=, 12 -; NO-SIMD128-NEXT: 
i32.add $push23=, $0, $pop22 -; NO-SIMD128-NEXT: i32.extend8_s $push19=, $13 -; NO-SIMD128-NEXT: i32.extend8_s $push18=, $29 -; NO-SIMD128-NEXT: i32.lt_s $push20=, $pop19, $pop18 -; NO-SIMD128-NEXT: i32.select $push21=, $13, $29, $pop20 -; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21 -; NO-SIMD128-NEXT: i32.const $push28=, 11 -; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28 -; NO-SIMD128-NEXT: i32.extend8_s $push25=, $12 -; NO-SIMD128-NEXT: i32.extend8_s $push24=, $28 +; NO-SIMD128-NEXT: i32.select $push15=, $13, $29, $pop14 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop15 +; NO-SIMD128-NEXT: i32.extend8_s $push17=, $12 +; NO-SIMD128-NEXT: i32.extend8_s $push16=, $28 +; NO-SIMD128-NEXT: i32.lt_s $push18=, $pop17, $pop16 +; NO-SIMD128-NEXT: i32.select $push19=, $12, $28, $pop18 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop19 +; NO-SIMD128-NEXT: i32.extend8_s $push21=, $11 +; NO-SIMD128-NEXT: i32.extend8_s $push20=, $27 +; NO-SIMD128-NEXT: i32.lt_s $push22=, $pop21, $pop20 +; NO-SIMD128-NEXT: i32.select $push23=, $11, $27, $pop22 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop23 +; NO-SIMD128-NEXT: i32.extend8_s $push25=, $10 +; NO-SIMD128-NEXT: i32.extend8_s $push24=, $26 ; NO-SIMD128-NEXT: i32.lt_s $push26=, $pop25, $pop24 -; NO-SIMD128-NEXT: i32.select $push27=, $12, $28, $pop26 -; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27 -; NO-SIMD128-NEXT: i32.const $push34=, 10 -; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34 -; NO-SIMD128-NEXT: i32.extend8_s $push31=, $11 -; NO-SIMD128-NEXT: i32.extend8_s $push30=, $27 -; NO-SIMD128-NEXT: i32.lt_s $push32=, $pop31, $pop30 -; NO-SIMD128-NEXT: i32.select $push33=, $11, $27, $pop32 -; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33 -; NO-SIMD128-NEXT: i32.const $push40=, 9 -; NO-SIMD128-NEXT: i32.add $push41=, $0, $pop40 -; NO-SIMD128-NEXT: i32.extend8_s $push37=, $10 -; NO-SIMD128-NEXT: i32.extend8_s $push36=, $26 +; NO-SIMD128-NEXT: i32.select $push27=, $10, $26, $pop26 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop27 +; 
NO-SIMD128-NEXT: i32.extend8_s $push29=, $9 +; NO-SIMD128-NEXT: i32.extend8_s $push28=, $25 +; NO-SIMD128-NEXT: i32.lt_s $push30=, $pop29, $pop28 +; NO-SIMD128-NEXT: i32.select $push31=, $9, $25, $pop30 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop31 +; NO-SIMD128-NEXT: i32.extend8_s $push33=, $8 +; NO-SIMD128-NEXT: i32.extend8_s $push32=, $24 +; NO-SIMD128-NEXT: i32.lt_s $push34=, $pop33, $pop32 +; NO-SIMD128-NEXT: i32.select $push35=, $8, $24, $pop34 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop35 +; NO-SIMD128-NEXT: i32.extend8_s $push37=, $7 +; NO-SIMD128-NEXT: i32.extend8_s $push36=, $23 ; NO-SIMD128-NEXT: i32.lt_s $push38=, $pop37, $pop36 -; NO-SIMD128-NEXT: i32.select $push39=, $10, $26, $pop38 -; NO-SIMD128-NEXT: i32.store8 0($pop41), $pop39 -; NO-SIMD128-NEXT: i32.extend8_s $push43=, $9 -; NO-SIMD128-NEXT: i32.extend8_s $push42=, $25 -; NO-SIMD128-NEXT: i32.lt_s $push44=, $pop43, $pop42 -; NO-SIMD128-NEXT: i32.select $push45=, $9, $25, $pop44 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop45 -; NO-SIMD128-NEXT: i32.const $push50=, 7 -; NO-SIMD128-NEXT: i32.add $push51=, $0, $pop50 -; NO-SIMD128-NEXT: i32.extend8_s $push47=, $8 -; NO-SIMD128-NEXT: i32.extend8_s $push46=, $24 -; NO-SIMD128-NEXT: i32.lt_s $push48=, $pop47, $pop46 -; NO-SIMD128-NEXT: i32.select $push49=, $8, $24, $pop48 -; NO-SIMD128-NEXT: i32.store8 0($pop51), $pop49 -; NO-SIMD128-NEXT: i32.const $push56=, 6 -; NO-SIMD128-NEXT: i32.add $push57=, $0, $pop56 -; NO-SIMD128-NEXT: i32.extend8_s $push53=, $7 -; NO-SIMD128-NEXT: i32.extend8_s $push52=, $23 +; NO-SIMD128-NEXT: i32.select $push39=, $7, $23, $pop38 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop39 +; NO-SIMD128-NEXT: i32.extend8_s $push41=, $6 +; NO-SIMD128-NEXT: i32.extend8_s $push40=, $22 +; NO-SIMD128-NEXT: i32.lt_s $push42=, $pop41, $pop40 +; NO-SIMD128-NEXT: i32.select $push43=, $6, $22, $pop42 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop43 +; NO-SIMD128-NEXT: i32.extend8_s $push45=, $5 +; NO-SIMD128-NEXT: i32.extend8_s $push44=, $21 +; 
NO-SIMD128-NEXT: i32.lt_s $push46=, $pop45, $pop44 +; NO-SIMD128-NEXT: i32.select $push47=, $5, $21, $pop46 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop47 +; NO-SIMD128-NEXT: i32.extend8_s $push49=, $4 +; NO-SIMD128-NEXT: i32.extend8_s $push48=, $20 +; NO-SIMD128-NEXT: i32.lt_s $push50=, $pop49, $pop48 +; NO-SIMD128-NEXT: i32.select $push51=, $4, $20, $pop50 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop51 +; NO-SIMD128-NEXT: i32.extend8_s $push53=, $3 +; NO-SIMD128-NEXT: i32.extend8_s $push52=, $19 ; NO-SIMD128-NEXT: i32.lt_s $push54=, $pop53, $pop52 -; NO-SIMD128-NEXT: i32.select $push55=, $7, $23, $pop54 -; NO-SIMD128-NEXT: i32.store8 0($pop57), $pop55 -; NO-SIMD128-NEXT: i32.const $push62=, 5 -; NO-SIMD128-NEXT: i32.add $push63=, $0, $pop62 -; NO-SIMD128-NEXT: i32.extend8_s $push59=, $6 -; NO-SIMD128-NEXT: i32.extend8_s $push58=, $22 -; NO-SIMD128-NEXT: i32.lt_s $push60=, $pop59, $pop58 -; NO-SIMD128-NEXT: i32.select $push61=, $6, $22, $pop60 -; NO-SIMD128-NEXT: i32.store8 0($pop63), $pop61 -; NO-SIMD128-NEXT: i32.extend8_s $push65=, $5 -; NO-SIMD128-NEXT: i32.extend8_s $push64=, $21 -; NO-SIMD128-NEXT: i32.lt_s $push66=, $pop65, $pop64 -; NO-SIMD128-NEXT: i32.select $push67=, $5, $21, $pop66 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop67 -; NO-SIMD128-NEXT: i32.const $push72=, 3 -; NO-SIMD128-NEXT: i32.add $push73=, $0, $pop72 -; NO-SIMD128-NEXT: i32.extend8_s $push69=, $4 -; NO-SIMD128-NEXT: i32.extend8_s $push68=, $20 -; NO-SIMD128-NEXT: i32.lt_s $push70=, $pop69, $pop68 -; NO-SIMD128-NEXT: i32.select $push71=, $4, $20, $pop70 -; NO-SIMD128-NEXT: i32.store8 0($pop73), $pop71 -; NO-SIMD128-NEXT: i32.extend8_s $push75=, $3 -; NO-SIMD128-NEXT: i32.extend8_s $push74=, $19 -; NO-SIMD128-NEXT: i32.lt_s $push76=, $pop75, $pop74 -; NO-SIMD128-NEXT: i32.select $push77=, $3, $19, $pop76 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop77 -; NO-SIMD128-NEXT: i32.extend8_s $push79=, $2 -; NO-SIMD128-NEXT: i32.extend8_s $push78=, $18 -; NO-SIMD128-NEXT: i32.lt_s $push80=, $pop79, 
$pop78 -; NO-SIMD128-NEXT: i32.select $push81=, $2, $18, $pop80 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop81 -; NO-SIMD128-NEXT: i32.extend8_s $push83=, $1 -; NO-SIMD128-NEXT: i32.extend8_s $push82=, $17 -; NO-SIMD128-NEXT: i32.lt_s $push84=, $pop83, $pop82 -; NO-SIMD128-NEXT: i32.select $push85=, $1, $17, $pop84 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop85 +; NO-SIMD128-NEXT: i32.select $push55=, $3, $19, $pop54 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop55 +; NO-SIMD128-NEXT: i32.extend8_s $push57=, $2 +; NO-SIMD128-NEXT: i32.extend8_s $push56=, $18 +; NO-SIMD128-NEXT: i32.lt_s $push58=, $pop57, $pop56 +; NO-SIMD128-NEXT: i32.select $push59=, $2, $18, $pop58 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop59 +; NO-SIMD128-NEXT: i32.extend8_s $push61=, $1 +; NO-SIMD128-NEXT: i32.extend8_s $push60=, $17 +; NO-SIMD128-NEXT: i32.lt_s $push62=, $pop61, $pop60 +; NO-SIMD128-NEXT: i32.select $push63=, $1, $17, $pop62 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop63 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: min_s_v16i8: @@ -681,93 +527,71 @@ define <16 x i8> @min_s_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-FAST-NEXT: i32.lt_s $push10=, $pop9, $pop8 ; NO-SIMD128-FAST-NEXT: i32.select $push11=, $3, $19, $pop10 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop11 -; NO-SIMD128-FAST-NEXT: i32.const $push16=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16 ; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $4 ; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $20 ; NO-SIMD128-FAST-NEXT: i32.lt_s $push14=, $pop13, $pop12 ; NO-SIMD128-FAST-NEXT: i32.select $push15=, $4, $20, $pop14 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop17), $pop15 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $5 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $21 -; NO-SIMD128-FAST-NEXT: i32.lt_s $push20=, $pop19, $pop18 -; NO-SIMD128-FAST-NEXT: i32.select $push21=, $5, $21, $pop20 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop21 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 5 -; 
NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $6 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $22 -; NO-SIMD128-FAST-NEXT: i32.lt_s $push24=, $pop23, $pop22 -; NO-SIMD128-FAST-NEXT: i32.select $push25=, $6, $22, $pop24 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push32=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $7 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $23 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop15 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push17=, $5 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $21 +; NO-SIMD128-FAST-NEXT: i32.lt_s $push18=, $pop17, $pop16 +; NO-SIMD128-FAST-NEXT: i32.select $push19=, $5, $21, $pop18 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop19 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $6 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $22 +; NO-SIMD128-FAST-NEXT: i32.lt_s $push22=, $pop21, $pop20 +; NO-SIMD128-FAST-NEXT: i32.select $push23=, $6, $22, $pop22 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop23 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $7 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push24=, $23 +; NO-SIMD128-FAST-NEXT: i32.lt_s $push26=, $pop25, $pop24 +; NO-SIMD128-FAST-NEXT: i32.select $push27=, $7, $23, $pop26 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop27 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $8 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $24 ; NO-SIMD128-FAST-NEXT: i32.lt_s $push30=, $pop29, $pop28 -; NO-SIMD128-FAST-NEXT: i32.select $push31=, $7, $23, $pop30 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop31 -; NO-SIMD128-FAST-NEXT: i32.const $push38=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push35=, $8 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push34=, $24 -; NO-SIMD128-FAST-NEXT: i32.lt_s $push36=, $pop35, $pop34 -; NO-SIMD128-FAST-NEXT: i32.select 
$push37=, $8, $24, $pop36 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop39), $pop37 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $9 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $25 +; NO-SIMD128-FAST-NEXT: i32.select $push31=, $8, $24, $pop30 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop31 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push33=, $9 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push32=, $25 +; NO-SIMD128-FAST-NEXT: i32.lt_s $push34=, $pop33, $pop32 +; NO-SIMD128-FAST-NEXT: i32.select $push35=, $9, $25, $pop34 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop35 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push37=, $10 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push36=, $26 +; NO-SIMD128-FAST-NEXT: i32.lt_s $push38=, $pop37, $pop36 +; NO-SIMD128-FAST-NEXT: i32.select $push39=, $10, $26, $pop38 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop39 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $11 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $27 ; NO-SIMD128-FAST-NEXT: i32.lt_s $push42=, $pop41, $pop40 -; NO-SIMD128-FAST-NEXT: i32.select $push43=, $9, $25, $pop42 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop43 -; NO-SIMD128-FAST-NEXT: i32.const $push48=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push49=, $0, $pop48 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $10 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $26 +; NO-SIMD128-FAST-NEXT: i32.select $push43=, $11, $27, $pop42 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop43 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $12 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $28 ; NO-SIMD128-FAST-NEXT: i32.lt_s $push46=, $pop45, $pop44 -; NO-SIMD128-FAST-NEXT: i32.select $push47=, $10, $26, $pop46 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop49), $pop47 -; NO-SIMD128-FAST-NEXT: i32.const $push54=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push55=, $0, $pop54 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push51=, $11 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push50=, $27 -; NO-SIMD128-FAST-NEXT: i32.lt_s $push52=, $pop51, $pop50 -; 
NO-SIMD128-FAST-NEXT: i32.select $push53=, $11, $27, $pop52 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop55), $pop53 -; NO-SIMD128-FAST-NEXT: i32.const $push60=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push61=, $0, $pop60 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push57=, $12 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push56=, $28 +; NO-SIMD128-FAST-NEXT: i32.select $push47=, $12, $28, $pop46 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop47 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push49=, $13 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push48=, $29 +; NO-SIMD128-FAST-NEXT: i32.lt_s $push50=, $pop49, $pop48 +; NO-SIMD128-FAST-NEXT: i32.select $push51=, $13, $29, $pop50 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop51 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push53=, $14 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push52=, $30 +; NO-SIMD128-FAST-NEXT: i32.lt_s $push54=, $pop53, $pop52 +; NO-SIMD128-FAST-NEXT: i32.select $push55=, $14, $30, $pop54 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop55 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push57=, $15 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push56=, $31 ; NO-SIMD128-FAST-NEXT: i32.lt_s $push58=, $pop57, $pop56 -; NO-SIMD128-FAST-NEXT: i32.select $push59=, $12, $28, $pop58 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop61), $pop59 -; NO-SIMD128-FAST-NEXT: i32.const $push66=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push67=, $0, $pop66 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push63=, $13 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push62=, $29 -; NO-SIMD128-FAST-NEXT: i32.lt_s $push64=, $pop63, $pop62 -; NO-SIMD128-FAST-NEXT: i32.select $push65=, $13, $29, $pop64 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop67), $pop65 -; NO-SIMD128-FAST-NEXT: i32.const $push72=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push73=, $0, $pop72 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push69=, $14 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push68=, $30 -; NO-SIMD128-FAST-NEXT: i32.lt_s $push70=, $pop69, $pop68 -; NO-SIMD128-FAST-NEXT: i32.select $push71=, $14, $30, $pop70 -; 
NO-SIMD128-FAST-NEXT: i32.store8 0($pop73), $pop71 -; NO-SIMD128-FAST-NEXT: i32.const $push78=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push79=, $0, $pop78 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push75=, $15 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push74=, $31 -; NO-SIMD128-FAST-NEXT: i32.lt_s $push76=, $pop75, $pop74 -; NO-SIMD128-FAST-NEXT: i32.select $push77=, $15, $31, $pop76 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop79), $pop77 -; NO-SIMD128-FAST-NEXT: i32.const $push84=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push85=, $0, $pop84 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push81=, $16 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push80=, $32 -; NO-SIMD128-FAST-NEXT: i32.lt_s $push82=, $pop81, $pop80 -; NO-SIMD128-FAST-NEXT: i32.select $push83=, $16, $32, $pop82 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop85), $pop83 +; NO-SIMD128-FAST-NEXT: i32.select $push59=, $15, $31, $pop58 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop59 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push61=, $16 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push60=, $32 +; NO-SIMD128-FAST-NEXT: i32.lt_s $push62=, $pop61, $pop60 +; NO-SIMD128-FAST-NEXT: i32.select $push63=, $16, $32, $pop62 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop63 ; NO-SIMD128-FAST-NEXT: return %c = icmp slt <16 x i8> %x, %y %a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y @@ -790,140 +614,118 @@ define <16 x i8> @min_u_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-LABEL: min_u_v16i8: ; NO-SIMD128: .functype min_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push5=, 15 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 ; NO-SIMD128-NEXT: i32.const $push0=, 255 ; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0 -; NO-SIMD128-NEXT: i32.const $push117=, 255 -; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop117 +; NO-SIMD128-NEXT: i32.const $push95=, 255 +; 
NO-SIMD128-NEXT: i32.and $push1=, $32, $pop95 ; NO-SIMD128-NEXT: i32.lt_u $push3=, $pop2, $pop1 ; NO-SIMD128-NEXT: i32.select $push4=, $16, $32, $pop3 -; NO-SIMD128-NEXT: i32.store8 0($pop6), $pop4 -; NO-SIMD128-NEXT: i32.const $push11=, 14 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.const $push116=, 255 -; NO-SIMD128-NEXT: i32.and $push8=, $15, $pop116 -; NO-SIMD128-NEXT: i32.const $push115=, 255 -; NO-SIMD128-NEXT: i32.and $push7=, $31, $pop115 -; NO-SIMD128-NEXT: i32.lt_u $push9=, $pop8, $pop7 -; NO-SIMD128-NEXT: i32.select $push10=, $15, $31, $pop9 -; NO-SIMD128-NEXT: i32.store8 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push17=, 13 -; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-NEXT: i32.const $push114=, 255 -; NO-SIMD128-NEXT: i32.and $push14=, $14, $pop114 -; NO-SIMD128-NEXT: i32.const $push113=, 255 -; NO-SIMD128-NEXT: i32.and $push13=, $30, $pop113 -; NO-SIMD128-NEXT: i32.lt_u $push15=, $pop14, $pop13 -; NO-SIMD128-NEXT: i32.select $push16=, $14, $30, $pop15 -; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16 -; NO-SIMD128-NEXT: i32.const $push23=, 12 -; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-NEXT: i32.const $push112=, 255 -; NO-SIMD128-NEXT: i32.and $push20=, $13, $pop112 -; NO-SIMD128-NEXT: i32.const $push111=, 255 -; NO-SIMD128-NEXT: i32.and $push19=, $29, $pop111 -; NO-SIMD128-NEXT: i32.lt_u $push21=, $pop20, $pop19 -; NO-SIMD128-NEXT: i32.select $push22=, $13, $29, $pop21 -; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22 -; NO-SIMD128-NEXT: i32.const $push29=, 11 -; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-NEXT: i32.const $push110=, 255 -; NO-SIMD128-NEXT: i32.and $push26=, $12, $pop110 -; NO-SIMD128-NEXT: i32.const $push109=, 255 -; NO-SIMD128-NEXT: i32.and $push25=, $28, $pop109 -; NO-SIMD128-NEXT: i32.lt_u $push27=, $pop26, $pop25 -; NO-SIMD128-NEXT: i32.select $push28=, $12, $28, $pop27 -; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28 -; NO-SIMD128-NEXT: 
i32.const $push35=, 10 -; NO-SIMD128-NEXT: i32.add $push36=, $0, $pop35 -; NO-SIMD128-NEXT: i32.const $push108=, 255 -; NO-SIMD128-NEXT: i32.and $push32=, $11, $pop108 -; NO-SIMD128-NEXT: i32.const $push107=, 255 -; NO-SIMD128-NEXT: i32.and $push31=, $27, $pop107 -; NO-SIMD128-NEXT: i32.lt_u $push33=, $pop32, $pop31 -; NO-SIMD128-NEXT: i32.select $push34=, $11, $27, $pop33 -; NO-SIMD128-NEXT: i32.store8 0($pop36), $pop34 -; NO-SIMD128-NEXT: i32.const $push41=, 9 -; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41 -; NO-SIMD128-NEXT: i32.const $push106=, 255 -; NO-SIMD128-NEXT: i32.and $push38=, $10, $pop106 -; NO-SIMD128-NEXT: i32.const $push105=, 255 -; NO-SIMD128-NEXT: i32.and $push37=, $26, $pop105 -; NO-SIMD128-NEXT: i32.lt_u $push39=, $pop38, $pop37 -; NO-SIMD128-NEXT: i32.select $push40=, $10, $26, $pop39 -; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40 -; NO-SIMD128-NEXT: i32.const $push104=, 255 -; NO-SIMD128-NEXT: i32.and $push44=, $9, $pop104 -; NO-SIMD128-NEXT: i32.const $push103=, 255 -; NO-SIMD128-NEXT: i32.and $push43=, $25, $pop103 -; NO-SIMD128-NEXT: i32.lt_u $push45=, $pop44, $pop43 -; NO-SIMD128-NEXT: i32.select $push46=, $9, $25, $pop45 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop46 -; NO-SIMD128-NEXT: i32.const $push51=, 7 -; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51 -; NO-SIMD128-NEXT: i32.const $push102=, 255 -; NO-SIMD128-NEXT: i32.and $push48=, $8, $pop102 -; NO-SIMD128-NEXT: i32.const $push101=, 255 -; NO-SIMD128-NEXT: i32.and $push47=, $24, $pop101 -; NO-SIMD128-NEXT: i32.lt_u $push49=, $pop48, $pop47 -; NO-SIMD128-NEXT: i32.select $push50=, $8, $24, $pop49 -; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50 -; NO-SIMD128-NEXT: i32.const $push57=, 6 -; NO-SIMD128-NEXT: i32.add $push58=, $0, $pop57 -; NO-SIMD128-NEXT: i32.const $push100=, 255 -; NO-SIMD128-NEXT: i32.and $push54=, $7, $pop100 -; NO-SIMD128-NEXT: i32.const $push99=, 255 -; NO-SIMD128-NEXT: i32.and $push53=, $23, $pop99 -; NO-SIMD128-NEXT: i32.lt_u $push55=, $pop54, $pop53 -; 
NO-SIMD128-NEXT: i32.select $push56=, $7, $23, $pop55 -; NO-SIMD128-NEXT: i32.store8 0($pop58), $pop56 -; NO-SIMD128-NEXT: i32.const $push63=, 5 -; NO-SIMD128-NEXT: i32.add $push64=, $0, $pop63 -; NO-SIMD128-NEXT: i32.const $push98=, 255 -; NO-SIMD128-NEXT: i32.and $push60=, $6, $pop98 -; NO-SIMD128-NEXT: i32.const $push97=, 255 -; NO-SIMD128-NEXT: i32.and $push59=, $22, $pop97 -; NO-SIMD128-NEXT: i32.lt_u $push61=, $pop60, $pop59 -; NO-SIMD128-NEXT: i32.select $push62=, $6, $22, $pop61 -; NO-SIMD128-NEXT: i32.store8 0($pop64), $pop62 -; NO-SIMD128-NEXT: i32.const $push96=, 255 -; NO-SIMD128-NEXT: i32.and $push66=, $5, $pop96 -; NO-SIMD128-NEXT: i32.const $push95=, 255 -; NO-SIMD128-NEXT: i32.and $push65=, $21, $pop95 -; NO-SIMD128-NEXT: i32.lt_u $push67=, $pop66, $pop65 -; NO-SIMD128-NEXT: i32.select $push68=, $5, $21, $pop67 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop68 -; NO-SIMD128-NEXT: i32.const $push73=, 3 -; NO-SIMD128-NEXT: i32.add $push74=, $0, $pop73 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop4 ; NO-SIMD128-NEXT: i32.const $push94=, 255 -; NO-SIMD128-NEXT: i32.and $push70=, $4, $pop94 +; NO-SIMD128-NEXT: i32.and $push6=, $15, $pop94 ; NO-SIMD128-NEXT: i32.const $push93=, 255 -; NO-SIMD128-NEXT: i32.and $push69=, $20, $pop93 -; NO-SIMD128-NEXT: i32.lt_u $push71=, $pop70, $pop69 -; NO-SIMD128-NEXT: i32.select $push72=, $4, $20, $pop71 -; NO-SIMD128-NEXT: i32.store8 0($pop74), $pop72 +; NO-SIMD128-NEXT: i32.and $push5=, $31, $pop93 +; NO-SIMD128-NEXT: i32.lt_u $push7=, $pop6, $pop5 +; NO-SIMD128-NEXT: i32.select $push8=, $15, $31, $pop7 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop8 ; NO-SIMD128-NEXT: i32.const $push92=, 255 -; NO-SIMD128-NEXT: i32.and $push76=, $3, $pop92 +; NO-SIMD128-NEXT: i32.and $push10=, $14, $pop92 ; NO-SIMD128-NEXT: i32.const $push91=, 255 -; NO-SIMD128-NEXT: i32.and $push75=, $19, $pop91 -; NO-SIMD128-NEXT: i32.lt_u $push77=, $pop76, $pop75 -; NO-SIMD128-NEXT: i32.select $push78=, $3, $19, $pop77 -; NO-SIMD128-NEXT: i32.store8 2($0), 
$pop78 +; NO-SIMD128-NEXT: i32.and $push9=, $30, $pop91 +; NO-SIMD128-NEXT: i32.lt_u $push11=, $pop10, $pop9 +; NO-SIMD128-NEXT: i32.select $push12=, $14, $30, $pop11 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop12 ; NO-SIMD128-NEXT: i32.const $push90=, 255 -; NO-SIMD128-NEXT: i32.and $push80=, $2, $pop90 +; NO-SIMD128-NEXT: i32.and $push14=, $13, $pop90 ; NO-SIMD128-NEXT: i32.const $push89=, 255 -; NO-SIMD128-NEXT: i32.and $push79=, $18, $pop89 -; NO-SIMD128-NEXT: i32.lt_u $push81=, $pop80, $pop79 -; NO-SIMD128-NEXT: i32.select $push82=, $2, $18, $pop81 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop82 +; NO-SIMD128-NEXT: i32.and $push13=, $29, $pop89 +; NO-SIMD128-NEXT: i32.lt_u $push15=, $pop14, $pop13 +; NO-SIMD128-NEXT: i32.select $push16=, $13, $29, $pop15 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop16 ; NO-SIMD128-NEXT: i32.const $push88=, 255 -; NO-SIMD128-NEXT: i32.and $push84=, $1, $pop88 +; NO-SIMD128-NEXT: i32.and $push18=, $12, $pop88 ; NO-SIMD128-NEXT: i32.const $push87=, 255 -; NO-SIMD128-NEXT: i32.and $push83=, $17, $pop87 -; NO-SIMD128-NEXT: i32.lt_u $push85=, $pop84, $pop83 -; NO-SIMD128-NEXT: i32.select $push86=, $1, $17, $pop85 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop86 +; NO-SIMD128-NEXT: i32.and $push17=, $28, $pop87 +; NO-SIMD128-NEXT: i32.lt_u $push19=, $pop18, $pop17 +; NO-SIMD128-NEXT: i32.select $push20=, $12, $28, $pop19 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop20 +; NO-SIMD128-NEXT: i32.const $push86=, 255 +; NO-SIMD128-NEXT: i32.and $push22=, $11, $pop86 +; NO-SIMD128-NEXT: i32.const $push85=, 255 +; NO-SIMD128-NEXT: i32.and $push21=, $27, $pop85 +; NO-SIMD128-NEXT: i32.lt_u $push23=, $pop22, $pop21 +; NO-SIMD128-NEXT: i32.select $push24=, $11, $27, $pop23 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop24 +; NO-SIMD128-NEXT: i32.const $push84=, 255 +; NO-SIMD128-NEXT: i32.and $push26=, $10, $pop84 +; NO-SIMD128-NEXT: i32.const $push83=, 255 +; NO-SIMD128-NEXT: i32.and $push25=, $26, $pop83 +; NO-SIMD128-NEXT: i32.lt_u $push27=, $pop26, $pop25 
+; NO-SIMD128-NEXT: i32.select $push28=, $10, $26, $pop27 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop28 +; NO-SIMD128-NEXT: i32.const $push82=, 255 +; NO-SIMD128-NEXT: i32.and $push30=, $9, $pop82 +; NO-SIMD128-NEXT: i32.const $push81=, 255 +; NO-SIMD128-NEXT: i32.and $push29=, $25, $pop81 +; NO-SIMD128-NEXT: i32.lt_u $push31=, $pop30, $pop29 +; NO-SIMD128-NEXT: i32.select $push32=, $9, $25, $pop31 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop32 +; NO-SIMD128-NEXT: i32.const $push80=, 255 +; NO-SIMD128-NEXT: i32.and $push34=, $8, $pop80 +; NO-SIMD128-NEXT: i32.const $push79=, 255 +; NO-SIMD128-NEXT: i32.and $push33=, $24, $pop79 +; NO-SIMD128-NEXT: i32.lt_u $push35=, $pop34, $pop33 +; NO-SIMD128-NEXT: i32.select $push36=, $8, $24, $pop35 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop36 +; NO-SIMD128-NEXT: i32.const $push78=, 255 +; NO-SIMD128-NEXT: i32.and $push38=, $7, $pop78 +; NO-SIMD128-NEXT: i32.const $push77=, 255 +; NO-SIMD128-NEXT: i32.and $push37=, $23, $pop77 +; NO-SIMD128-NEXT: i32.lt_u $push39=, $pop38, $pop37 +; NO-SIMD128-NEXT: i32.select $push40=, $7, $23, $pop39 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop40 +; NO-SIMD128-NEXT: i32.const $push76=, 255 +; NO-SIMD128-NEXT: i32.and $push42=, $6, $pop76 +; NO-SIMD128-NEXT: i32.const $push75=, 255 +; NO-SIMD128-NEXT: i32.and $push41=, $22, $pop75 +; NO-SIMD128-NEXT: i32.lt_u $push43=, $pop42, $pop41 +; NO-SIMD128-NEXT: i32.select $push44=, $6, $22, $pop43 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop44 +; NO-SIMD128-NEXT: i32.const $push74=, 255 +; NO-SIMD128-NEXT: i32.and $push46=, $5, $pop74 +; NO-SIMD128-NEXT: i32.const $push73=, 255 +; NO-SIMD128-NEXT: i32.and $push45=, $21, $pop73 +; NO-SIMD128-NEXT: i32.lt_u $push47=, $pop46, $pop45 +; NO-SIMD128-NEXT: i32.select $push48=, $5, $21, $pop47 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop48 +; NO-SIMD128-NEXT: i32.const $push72=, 255 +; NO-SIMD128-NEXT: i32.and $push50=, $4, $pop72 +; NO-SIMD128-NEXT: i32.const $push71=, 255 +; NO-SIMD128-NEXT: i32.and $push49=, $20, 
$pop71 +; NO-SIMD128-NEXT: i32.lt_u $push51=, $pop50, $pop49 +; NO-SIMD128-NEXT: i32.select $push52=, $4, $20, $pop51 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop52 +; NO-SIMD128-NEXT: i32.const $push70=, 255 +; NO-SIMD128-NEXT: i32.and $push54=, $3, $pop70 +; NO-SIMD128-NEXT: i32.const $push69=, 255 +; NO-SIMD128-NEXT: i32.and $push53=, $19, $pop69 +; NO-SIMD128-NEXT: i32.lt_u $push55=, $pop54, $pop53 +; NO-SIMD128-NEXT: i32.select $push56=, $3, $19, $pop55 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop56 +; NO-SIMD128-NEXT: i32.const $push68=, 255 +; NO-SIMD128-NEXT: i32.and $push58=, $2, $pop68 +; NO-SIMD128-NEXT: i32.const $push67=, 255 +; NO-SIMD128-NEXT: i32.and $push57=, $18, $pop67 +; NO-SIMD128-NEXT: i32.lt_u $push59=, $pop58, $pop57 +; NO-SIMD128-NEXT: i32.select $push60=, $2, $18, $pop59 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop60 +; NO-SIMD128-NEXT: i32.const $push66=, 255 +; NO-SIMD128-NEXT: i32.and $push62=, $1, $pop66 +; NO-SIMD128-NEXT: i32.const $push65=, 255 +; NO-SIMD128-NEXT: i32.and $push61=, $17, $pop65 +; NO-SIMD128-NEXT: i32.lt_u $push63=, $pop62, $pop61 +; NO-SIMD128-NEXT: i32.select $push64=, $1, $17, $pop63 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop64 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: min_u_v16i8: @@ -931,138 +733,116 @@ define <16 x i8> @min_u_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255 ; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0 -; NO-SIMD128-FAST-NEXT: i32.const $push117=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop117 +; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop95 ; NO-SIMD128-FAST-NEXT: i32.lt_u $push3=, $pop2, $pop1 ; NO-SIMD128-FAST-NEXT: i32.select $push4=, $1, $17, $pop3 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push116=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop116 -; NO-SIMD128-FAST-NEXT: i32.const $push115=, 255 -; 
NO-SIMD128-FAST-NEXT: i32.and $push5=, $18, $pop115 +; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop94 +; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $18, $pop93 ; NO-SIMD128-FAST-NEXT: i32.lt_u $push7=, $pop6, $pop5 ; NO-SIMD128-FAST-NEXT: i32.select $push8=, $2, $18, $pop7 ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push114=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop114 -; NO-SIMD128-FAST-NEXT: i32.const $push113=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push9=, $19, $pop113 +; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop92 +; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push9=, $19, $pop91 ; NO-SIMD128-FAST-NEXT: i32.lt_u $push11=, $pop10, $pop9 ; NO-SIMD128-FAST-NEXT: i32.select $push12=, $3, $19, $pop11 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push112=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop112 -; NO-SIMD128-FAST-NEXT: i32.const $push111=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push13=, $20, $pop111 +; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop90 +; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $20, $pop89 ; NO-SIMD128-FAST-NEXT: i32.lt_u $push15=, $pop14, $pop13 ; NO-SIMD128-FAST-NEXT: i32.select $push16=, $4, $20, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push110=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push20=, $5, $pop110 -; NO-SIMD128-FAST-NEXT: i32.const $push109=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push19=, $21, $pop109 -; NO-SIMD128-FAST-NEXT: i32.lt_u $push21=, $pop20, $pop19 -; NO-SIMD128-FAST-NEXT: i32.select 
$push22=, $5, $21, $pop21 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.const $push108=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push24=, $6, $pop108 -; NO-SIMD128-FAST-NEXT: i32.const $push107=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push23=, $22, $pop107 -; NO-SIMD128-FAST-NEXT: i32.lt_u $push25=, $pop24, $pop23 -; NO-SIMD128-FAST-NEXT: i32.select $push26=, $6, $22, $pop25 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push33=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-FAST-NEXT: i32.const $push106=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push30=, $7, $pop106 -; NO-SIMD128-FAST-NEXT: i32.const $push105=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push29=, $23, $pop105 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push18=, $5, $pop88 +; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $21, $pop87 +; NO-SIMD128-FAST-NEXT: i32.lt_u $push19=, $pop18, $pop17 +; NO-SIMD128-FAST-NEXT: i32.select $push20=, $5, $21, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.const $push86=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $6, $pop86 +; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push21=, $22, $pop85 +; NO-SIMD128-FAST-NEXT: i32.lt_u $push23=, $pop22, $pop21 +; NO-SIMD128-FAST-NEXT: i32.select $push24=, $6, $22, $pop23 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push26=, $7, $pop84 +; NO-SIMD128-FAST-NEXT: i32.const $push83=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push25=, $23, $pop83 +; NO-SIMD128-FAST-NEXT: i32.lt_u $push27=, $pop26, $pop25 +; NO-SIMD128-FAST-NEXT: i32.select $push28=, $7, 
$23, $pop27 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28 +; NO-SIMD128-FAST-NEXT: i32.const $push82=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push30=, $8, $pop82 +; NO-SIMD128-FAST-NEXT: i32.const $push81=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push29=, $24, $pop81 ; NO-SIMD128-FAST-NEXT: i32.lt_u $push31=, $pop30, $pop29 -; NO-SIMD128-FAST-NEXT: i32.select $push32=, $7, $23, $pop31 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39 -; NO-SIMD128-FAST-NEXT: i32.const $push104=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push36=, $8, $pop104 -; NO-SIMD128-FAST-NEXT: i32.const $push103=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push35=, $24, $pop103 -; NO-SIMD128-FAST-NEXT: i32.lt_u $push37=, $pop36, $pop35 -; NO-SIMD128-FAST-NEXT: i32.select $push38=, $8, $24, $pop37 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38 -; NO-SIMD128-FAST-NEXT: i32.const $push102=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push42=, $9, $pop102 -; NO-SIMD128-FAST-NEXT: i32.const $push101=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push41=, $25, $pop101 +; NO-SIMD128-FAST-NEXT: i32.select $push32=, $8, $24, $pop31 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32 +; NO-SIMD128-FAST-NEXT: i32.const $push80=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push34=, $9, $pop80 +; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push33=, $25, $pop79 +; NO-SIMD128-FAST-NEXT: i32.lt_u $push35=, $pop34, $pop33 +; NO-SIMD128-FAST-NEXT: i32.select $push36=, $9, $25, $pop35 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36 +; NO-SIMD128-FAST-NEXT: i32.const $push78=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push38=, $10, $pop78 +; NO-SIMD128-FAST-NEXT: i32.const $push77=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push37=, $26, $pop77 +; NO-SIMD128-FAST-NEXT: i32.lt_u $push39=, $pop38, $pop37 +; NO-SIMD128-FAST-NEXT: i32.select $push40=, $10, $26, $pop39 +; NO-SIMD128-FAST-NEXT: i32.store8 
9($0), $pop40 +; NO-SIMD128-FAST-NEXT: i32.const $push76=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push42=, $11, $pop76 +; NO-SIMD128-FAST-NEXT: i32.const $push75=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push41=, $27, $pop75 ; NO-SIMD128-FAST-NEXT: i32.lt_u $push43=, $pop42, $pop41 -; NO-SIMD128-FAST-NEXT: i32.select $push44=, $9, $25, $pop43 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop44 -; NO-SIMD128-FAST-NEXT: i32.const $push49=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49 -; NO-SIMD128-FAST-NEXT: i32.const $push100=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push46=, $10, $pop100 -; NO-SIMD128-FAST-NEXT: i32.const $push99=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push45=, $26, $pop99 +; NO-SIMD128-FAST-NEXT: i32.select $push44=, $11, $27, $pop43 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44 +; NO-SIMD128-FAST-NEXT: i32.const $push74=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push46=, $12, $pop74 +; NO-SIMD128-FAST-NEXT: i32.const $push73=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push45=, $28, $pop73 ; NO-SIMD128-FAST-NEXT: i32.lt_u $push47=, $pop46, $pop45 -; NO-SIMD128-FAST-NEXT: i32.select $push48=, $10, $26, $pop47 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48 -; NO-SIMD128-FAST-NEXT: i32.const $push55=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55 -; NO-SIMD128-FAST-NEXT: i32.const $push98=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push52=, $11, $pop98 -; NO-SIMD128-FAST-NEXT: i32.const $push97=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push51=, $27, $pop97 -; NO-SIMD128-FAST-NEXT: i32.lt_u $push53=, $pop52, $pop51 -; NO-SIMD128-FAST-NEXT: i32.select $push54=, $11, $27, $pop53 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop54 -; NO-SIMD128-FAST-NEXT: i32.const $push61=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61 -; NO-SIMD128-FAST-NEXT: i32.const $push96=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push58=, $12, $pop96 -; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push57=, $28, $pop95 +; 
NO-SIMD128-FAST-NEXT: i32.select $push48=, $12, $28, $pop47 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48 +; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push50=, $13, $pop72 +; NO-SIMD128-FAST-NEXT: i32.const $push71=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push49=, $29, $pop71 +; NO-SIMD128-FAST-NEXT: i32.lt_u $push51=, $pop50, $pop49 +; NO-SIMD128-FAST-NEXT: i32.select $push52=, $13, $29, $pop51 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52 +; NO-SIMD128-FAST-NEXT: i32.const $push70=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push54=, $14, $pop70 +; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push53=, $30, $pop69 +; NO-SIMD128-FAST-NEXT: i32.lt_u $push55=, $pop54, $pop53 +; NO-SIMD128-FAST-NEXT: i32.select $push56=, $14, $30, $pop55 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56 +; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push58=, $15, $pop68 +; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push57=, $31, $pop67 ; NO-SIMD128-FAST-NEXT: i32.lt_u $push59=, $pop58, $pop57 -; NO-SIMD128-FAST-NEXT: i32.select $push60=, $12, $28, $pop59 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop60 -; NO-SIMD128-FAST-NEXT: i32.const $push67=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push68=, $0, $pop67 -; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push64=, $13, $pop94 -; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push63=, $29, $pop93 -; NO-SIMD128-FAST-NEXT: i32.lt_u $push65=, $pop64, $pop63 -; NO-SIMD128-FAST-NEXT: i32.select $push66=, $13, $29, $pop65 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop68), $pop66 -; NO-SIMD128-FAST-NEXT: i32.const $push73=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push74=, $0, $pop73 -; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push70=, $14, $pop92 -; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255 -; 
NO-SIMD128-FAST-NEXT: i32.and $push69=, $30, $pop91 -; NO-SIMD128-FAST-NEXT: i32.lt_u $push71=, $pop70, $pop69 -; NO-SIMD128-FAST-NEXT: i32.select $push72=, $14, $30, $pop71 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop74), $pop72 -; NO-SIMD128-FAST-NEXT: i32.const $push79=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push80=, $0, $pop79 -; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push76=, $15, $pop90 -; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push75=, $31, $pop89 -; NO-SIMD128-FAST-NEXT: i32.lt_u $push77=, $pop76, $pop75 -; NO-SIMD128-FAST-NEXT: i32.select $push78=, $15, $31, $pop77 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop80), $pop78 -; NO-SIMD128-FAST-NEXT: i32.const $push85=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push86=, $0, $pop85 -; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push82=, $16, $pop88 -; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push81=, $32, $pop87 -; NO-SIMD128-FAST-NEXT: i32.lt_u $push83=, $pop82, $pop81 -; NO-SIMD128-FAST-NEXT: i32.select $push84=, $16, $32, $pop83 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop86), $pop84 +; NO-SIMD128-FAST-NEXT: i32.select $push60=, $15, $31, $pop59 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60 +; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push62=, $16, $pop66 +; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push61=, $32, $pop65 +; NO-SIMD128-FAST-NEXT: i32.lt_u $push63=, $pop62, $pop61 +; NO-SIMD128-FAST-NEXT: i32.select $push64=, $16, $32, $pop63 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64 ; NO-SIMD128-FAST-NEXT: return %c = icmp ult <16 x i8> %x, %y %a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y @@ -1085,108 +865,86 @@ define <16 x i8> @max_s_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-LABEL: max_s_v16i8: ; NO-SIMD128: .functype max_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push4=, 15 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 ; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16 ; NO-SIMD128-NEXT: i32.extend8_s $push0=, $32 ; NO-SIMD128-NEXT: i32.gt_s $push2=, $pop1, $pop0 ; NO-SIMD128-NEXT: i32.select $push3=, $16, $32, $pop2 -; NO-SIMD128-NEXT: i32.store8 0($pop5), $pop3 -; NO-SIMD128-NEXT: i32.const $push10=, 14 -; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-NEXT: i32.extend8_s $push7=, $15 -; NO-SIMD128-NEXT: i32.extend8_s $push6=, $31 -; NO-SIMD128-NEXT: i32.gt_s $push8=, $pop7, $pop6 -; NO-SIMD128-NEXT: i32.select $push9=, $15, $31, $pop8 -; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9 -; NO-SIMD128-NEXT: i32.const $push16=, 13 -; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.extend8_s $push13=, $14 -; NO-SIMD128-NEXT: i32.extend8_s $push12=, $30 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop3 +; NO-SIMD128-NEXT: i32.extend8_s $push5=, $15 +; NO-SIMD128-NEXT: i32.extend8_s $push4=, $31 +; NO-SIMD128-NEXT: i32.gt_s $push6=, $pop5, $pop4 +; NO-SIMD128-NEXT: i32.select $push7=, $15, $31, $pop6 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop7 +; NO-SIMD128-NEXT: i32.extend8_s $push9=, $14 +; NO-SIMD128-NEXT: i32.extend8_s $push8=, $30 +; NO-SIMD128-NEXT: i32.gt_s $push10=, $pop9, $pop8 +; NO-SIMD128-NEXT: i32.select $push11=, $14, $30, $pop10 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop11 +; NO-SIMD128-NEXT: i32.extend8_s $push13=, $13 +; NO-SIMD128-NEXT: i32.extend8_s $push12=, $29 ; NO-SIMD128-NEXT: i32.gt_s $push14=, $pop13, $pop12 -; NO-SIMD128-NEXT: i32.select $push15=, $14, $30, $pop14 -; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15 -; NO-SIMD128-NEXT: i32.const $push22=, 12 -; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22 -; NO-SIMD128-NEXT: i32.extend8_s $push19=, $13 -; NO-SIMD128-NEXT: i32.extend8_s 
$push18=, $29 -; NO-SIMD128-NEXT: i32.gt_s $push20=, $pop19, $pop18 -; NO-SIMD128-NEXT: i32.select $push21=, $13, $29, $pop20 -; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21 -; NO-SIMD128-NEXT: i32.const $push28=, 11 -; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28 -; NO-SIMD128-NEXT: i32.extend8_s $push25=, $12 -; NO-SIMD128-NEXT: i32.extend8_s $push24=, $28 +; NO-SIMD128-NEXT: i32.select $push15=, $13, $29, $pop14 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop15 +; NO-SIMD128-NEXT: i32.extend8_s $push17=, $12 +; NO-SIMD128-NEXT: i32.extend8_s $push16=, $28 +; NO-SIMD128-NEXT: i32.gt_s $push18=, $pop17, $pop16 +; NO-SIMD128-NEXT: i32.select $push19=, $12, $28, $pop18 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop19 +; NO-SIMD128-NEXT: i32.extend8_s $push21=, $11 +; NO-SIMD128-NEXT: i32.extend8_s $push20=, $27 +; NO-SIMD128-NEXT: i32.gt_s $push22=, $pop21, $pop20 +; NO-SIMD128-NEXT: i32.select $push23=, $11, $27, $pop22 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop23 +; NO-SIMD128-NEXT: i32.extend8_s $push25=, $10 +; NO-SIMD128-NEXT: i32.extend8_s $push24=, $26 ; NO-SIMD128-NEXT: i32.gt_s $push26=, $pop25, $pop24 -; NO-SIMD128-NEXT: i32.select $push27=, $12, $28, $pop26 -; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27 -; NO-SIMD128-NEXT: i32.const $push34=, 10 -; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34 -; NO-SIMD128-NEXT: i32.extend8_s $push31=, $11 -; NO-SIMD128-NEXT: i32.extend8_s $push30=, $27 -; NO-SIMD128-NEXT: i32.gt_s $push32=, $pop31, $pop30 -; NO-SIMD128-NEXT: i32.select $push33=, $11, $27, $pop32 -; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33 -; NO-SIMD128-NEXT: i32.const $push40=, 9 -; NO-SIMD128-NEXT: i32.add $push41=, $0, $pop40 -; NO-SIMD128-NEXT: i32.extend8_s $push37=, $10 -; NO-SIMD128-NEXT: i32.extend8_s $push36=, $26 +; NO-SIMD128-NEXT: i32.select $push27=, $10, $26, $pop26 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop27 +; NO-SIMD128-NEXT: i32.extend8_s $push29=, $9 +; NO-SIMD128-NEXT: i32.extend8_s $push28=, $25 +; NO-SIMD128-NEXT: i32.gt_s 
$push30=, $pop29, $pop28 +; NO-SIMD128-NEXT: i32.select $push31=, $9, $25, $pop30 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop31 +; NO-SIMD128-NEXT: i32.extend8_s $push33=, $8 +; NO-SIMD128-NEXT: i32.extend8_s $push32=, $24 +; NO-SIMD128-NEXT: i32.gt_s $push34=, $pop33, $pop32 +; NO-SIMD128-NEXT: i32.select $push35=, $8, $24, $pop34 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop35 +; NO-SIMD128-NEXT: i32.extend8_s $push37=, $7 +; NO-SIMD128-NEXT: i32.extend8_s $push36=, $23 ; NO-SIMD128-NEXT: i32.gt_s $push38=, $pop37, $pop36 -; NO-SIMD128-NEXT: i32.select $push39=, $10, $26, $pop38 -; NO-SIMD128-NEXT: i32.store8 0($pop41), $pop39 -; NO-SIMD128-NEXT: i32.extend8_s $push43=, $9 -; NO-SIMD128-NEXT: i32.extend8_s $push42=, $25 -; NO-SIMD128-NEXT: i32.gt_s $push44=, $pop43, $pop42 -; NO-SIMD128-NEXT: i32.select $push45=, $9, $25, $pop44 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop45 -; NO-SIMD128-NEXT: i32.const $push50=, 7 -; NO-SIMD128-NEXT: i32.add $push51=, $0, $pop50 -; NO-SIMD128-NEXT: i32.extend8_s $push47=, $8 -; NO-SIMD128-NEXT: i32.extend8_s $push46=, $24 -; NO-SIMD128-NEXT: i32.gt_s $push48=, $pop47, $pop46 -; NO-SIMD128-NEXT: i32.select $push49=, $8, $24, $pop48 -; NO-SIMD128-NEXT: i32.store8 0($pop51), $pop49 -; NO-SIMD128-NEXT: i32.const $push56=, 6 -; NO-SIMD128-NEXT: i32.add $push57=, $0, $pop56 -; NO-SIMD128-NEXT: i32.extend8_s $push53=, $7 -; NO-SIMD128-NEXT: i32.extend8_s $push52=, $23 +; NO-SIMD128-NEXT: i32.select $push39=, $7, $23, $pop38 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop39 +; NO-SIMD128-NEXT: i32.extend8_s $push41=, $6 +; NO-SIMD128-NEXT: i32.extend8_s $push40=, $22 +; NO-SIMD128-NEXT: i32.gt_s $push42=, $pop41, $pop40 +; NO-SIMD128-NEXT: i32.select $push43=, $6, $22, $pop42 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop43 +; NO-SIMD128-NEXT: i32.extend8_s $push45=, $5 +; NO-SIMD128-NEXT: i32.extend8_s $push44=, $21 +; NO-SIMD128-NEXT: i32.gt_s $push46=, $pop45, $pop44 +; NO-SIMD128-NEXT: i32.select $push47=, $5, $21, $pop46 +; NO-SIMD128-NEXT: 
i32.store8 4($0), $pop47 +; NO-SIMD128-NEXT: i32.extend8_s $push49=, $4 +; NO-SIMD128-NEXT: i32.extend8_s $push48=, $20 +; NO-SIMD128-NEXT: i32.gt_s $push50=, $pop49, $pop48 +; NO-SIMD128-NEXT: i32.select $push51=, $4, $20, $pop50 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop51 +; NO-SIMD128-NEXT: i32.extend8_s $push53=, $3 +; NO-SIMD128-NEXT: i32.extend8_s $push52=, $19 ; NO-SIMD128-NEXT: i32.gt_s $push54=, $pop53, $pop52 -; NO-SIMD128-NEXT: i32.select $push55=, $7, $23, $pop54 -; NO-SIMD128-NEXT: i32.store8 0($pop57), $pop55 -; NO-SIMD128-NEXT: i32.const $push62=, 5 -; NO-SIMD128-NEXT: i32.add $push63=, $0, $pop62 -; NO-SIMD128-NEXT: i32.extend8_s $push59=, $6 -; NO-SIMD128-NEXT: i32.extend8_s $push58=, $22 -; NO-SIMD128-NEXT: i32.gt_s $push60=, $pop59, $pop58 -; NO-SIMD128-NEXT: i32.select $push61=, $6, $22, $pop60 -; NO-SIMD128-NEXT: i32.store8 0($pop63), $pop61 -; NO-SIMD128-NEXT: i32.extend8_s $push65=, $5 -; NO-SIMD128-NEXT: i32.extend8_s $push64=, $21 -; NO-SIMD128-NEXT: i32.gt_s $push66=, $pop65, $pop64 -; NO-SIMD128-NEXT: i32.select $push67=, $5, $21, $pop66 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop67 -; NO-SIMD128-NEXT: i32.const $push72=, 3 -; NO-SIMD128-NEXT: i32.add $push73=, $0, $pop72 -; NO-SIMD128-NEXT: i32.extend8_s $push69=, $4 -; NO-SIMD128-NEXT: i32.extend8_s $push68=, $20 -; NO-SIMD128-NEXT: i32.gt_s $push70=, $pop69, $pop68 -; NO-SIMD128-NEXT: i32.select $push71=, $4, $20, $pop70 -; NO-SIMD128-NEXT: i32.store8 0($pop73), $pop71 -; NO-SIMD128-NEXT: i32.extend8_s $push75=, $3 -; NO-SIMD128-NEXT: i32.extend8_s $push74=, $19 -; NO-SIMD128-NEXT: i32.gt_s $push76=, $pop75, $pop74 -; NO-SIMD128-NEXT: i32.select $push77=, $3, $19, $pop76 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop77 -; NO-SIMD128-NEXT: i32.extend8_s $push79=, $2 -; NO-SIMD128-NEXT: i32.extend8_s $push78=, $18 -; NO-SIMD128-NEXT: i32.gt_s $push80=, $pop79, $pop78 -; NO-SIMD128-NEXT: i32.select $push81=, $2, $18, $pop80 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop81 -; NO-SIMD128-NEXT: 
i32.extend8_s $push83=, $1 -; NO-SIMD128-NEXT: i32.extend8_s $push82=, $17 -; NO-SIMD128-NEXT: i32.gt_s $push84=, $pop83, $pop82 -; NO-SIMD128-NEXT: i32.select $push85=, $1, $17, $pop84 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop85 +; NO-SIMD128-NEXT: i32.select $push55=, $3, $19, $pop54 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop55 +; NO-SIMD128-NEXT: i32.extend8_s $push57=, $2 +; NO-SIMD128-NEXT: i32.extend8_s $push56=, $18 +; NO-SIMD128-NEXT: i32.gt_s $push58=, $pop57, $pop56 +; NO-SIMD128-NEXT: i32.select $push59=, $2, $18, $pop58 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop59 +; NO-SIMD128-NEXT: i32.extend8_s $push61=, $1 +; NO-SIMD128-NEXT: i32.extend8_s $push60=, $17 +; NO-SIMD128-NEXT: i32.gt_s $push62=, $pop61, $pop60 +; NO-SIMD128-NEXT: i32.select $push63=, $1, $17, $pop62 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop63 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: max_s_v16i8: @@ -1207,93 +965,71 @@ define <16 x i8> @max_s_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-FAST-NEXT: i32.gt_s $push10=, $pop9, $pop8 ; NO-SIMD128-FAST-NEXT: i32.select $push11=, $3, $19, $pop10 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop11 -; NO-SIMD128-FAST-NEXT: i32.const $push16=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16 ; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $4 ; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $20 ; NO-SIMD128-FAST-NEXT: i32.gt_s $push14=, $pop13, $pop12 ; NO-SIMD128-FAST-NEXT: i32.select $push15=, $4, $20, $pop14 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop17), $pop15 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $5 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $21 -; NO-SIMD128-FAST-NEXT: i32.gt_s $push20=, $pop19, $pop18 -; NO-SIMD128-FAST-NEXT: i32.select $push21=, $5, $21, $pop20 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop21 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $6 -; NO-SIMD128-FAST-NEXT: i32.extend8_s 
$push22=, $22 -; NO-SIMD128-FAST-NEXT: i32.gt_s $push24=, $pop23, $pop22 -; NO-SIMD128-FAST-NEXT: i32.select $push25=, $6, $22, $pop24 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push32=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $7 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $23 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop15 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push17=, $5 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $21 +; NO-SIMD128-FAST-NEXT: i32.gt_s $push18=, $pop17, $pop16 +; NO-SIMD128-FAST-NEXT: i32.select $push19=, $5, $21, $pop18 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop19 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $6 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $22 +; NO-SIMD128-FAST-NEXT: i32.gt_s $push22=, $pop21, $pop20 +; NO-SIMD128-FAST-NEXT: i32.select $push23=, $6, $22, $pop22 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop23 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $7 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push24=, $23 +; NO-SIMD128-FAST-NEXT: i32.gt_s $push26=, $pop25, $pop24 +; NO-SIMD128-FAST-NEXT: i32.select $push27=, $7, $23, $pop26 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop27 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $8 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $24 ; NO-SIMD128-FAST-NEXT: i32.gt_s $push30=, $pop29, $pop28 -; NO-SIMD128-FAST-NEXT: i32.select $push31=, $7, $23, $pop30 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop31 -; NO-SIMD128-FAST-NEXT: i32.const $push38=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push35=, $8 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push34=, $24 -; NO-SIMD128-FAST-NEXT: i32.gt_s $push36=, $pop35, $pop34 -; NO-SIMD128-FAST-NEXT: i32.select $push37=, $8, $24, $pop36 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop39), $pop37 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $9 -; 
NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $25 +; NO-SIMD128-FAST-NEXT: i32.select $push31=, $8, $24, $pop30 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop31 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push33=, $9 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push32=, $25 +; NO-SIMD128-FAST-NEXT: i32.gt_s $push34=, $pop33, $pop32 +; NO-SIMD128-FAST-NEXT: i32.select $push35=, $9, $25, $pop34 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop35 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push37=, $10 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push36=, $26 +; NO-SIMD128-FAST-NEXT: i32.gt_s $push38=, $pop37, $pop36 +; NO-SIMD128-FAST-NEXT: i32.select $push39=, $10, $26, $pop38 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop39 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $11 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $27 ; NO-SIMD128-FAST-NEXT: i32.gt_s $push42=, $pop41, $pop40 -; NO-SIMD128-FAST-NEXT: i32.select $push43=, $9, $25, $pop42 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop43 -; NO-SIMD128-FAST-NEXT: i32.const $push48=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push49=, $0, $pop48 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $10 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $26 +; NO-SIMD128-FAST-NEXT: i32.select $push43=, $11, $27, $pop42 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop43 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $12 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $28 ; NO-SIMD128-FAST-NEXT: i32.gt_s $push46=, $pop45, $pop44 -; NO-SIMD128-FAST-NEXT: i32.select $push47=, $10, $26, $pop46 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop49), $pop47 -; NO-SIMD128-FAST-NEXT: i32.const $push54=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push55=, $0, $pop54 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push51=, $11 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push50=, $27 -; NO-SIMD128-FAST-NEXT: i32.gt_s $push52=, $pop51, $pop50 -; NO-SIMD128-FAST-NEXT: i32.select $push53=, $11, $27, $pop52 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop55), $pop53 -; 
NO-SIMD128-FAST-NEXT: i32.const $push60=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push61=, $0, $pop60 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push57=, $12 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push56=, $28 +; NO-SIMD128-FAST-NEXT: i32.select $push47=, $12, $28, $pop46 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop47 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push49=, $13 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push48=, $29 +; NO-SIMD128-FAST-NEXT: i32.gt_s $push50=, $pop49, $pop48 +; NO-SIMD128-FAST-NEXT: i32.select $push51=, $13, $29, $pop50 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop51 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push53=, $14 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push52=, $30 +; NO-SIMD128-FAST-NEXT: i32.gt_s $push54=, $pop53, $pop52 +; NO-SIMD128-FAST-NEXT: i32.select $push55=, $14, $30, $pop54 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop55 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push57=, $15 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push56=, $31 ; NO-SIMD128-FAST-NEXT: i32.gt_s $push58=, $pop57, $pop56 -; NO-SIMD128-FAST-NEXT: i32.select $push59=, $12, $28, $pop58 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop61), $pop59 -; NO-SIMD128-FAST-NEXT: i32.const $push66=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push67=, $0, $pop66 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push63=, $13 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push62=, $29 -; NO-SIMD128-FAST-NEXT: i32.gt_s $push64=, $pop63, $pop62 -; NO-SIMD128-FAST-NEXT: i32.select $push65=, $13, $29, $pop64 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop67), $pop65 -; NO-SIMD128-FAST-NEXT: i32.const $push72=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push73=, $0, $pop72 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push69=, $14 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push68=, $30 -; NO-SIMD128-FAST-NEXT: i32.gt_s $push70=, $pop69, $pop68 -; NO-SIMD128-FAST-NEXT: i32.select $push71=, $14, $30, $pop70 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop73), $pop71 -; NO-SIMD128-FAST-NEXT: i32.const $push78=, 14 -; NO-SIMD128-FAST-NEXT: 
i32.add $push79=, $0, $pop78 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push75=, $15 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push74=, $31 -; NO-SIMD128-FAST-NEXT: i32.gt_s $push76=, $pop75, $pop74 -; NO-SIMD128-FAST-NEXT: i32.select $push77=, $15, $31, $pop76 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop79), $pop77 -; NO-SIMD128-FAST-NEXT: i32.const $push84=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push85=, $0, $pop84 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push81=, $16 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push80=, $32 -; NO-SIMD128-FAST-NEXT: i32.gt_s $push82=, $pop81, $pop80 -; NO-SIMD128-FAST-NEXT: i32.select $push83=, $16, $32, $pop82 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop85), $pop83 +; NO-SIMD128-FAST-NEXT: i32.select $push59=, $15, $31, $pop58 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop59 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push61=, $16 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push60=, $32 +; NO-SIMD128-FAST-NEXT: i32.gt_s $push62=, $pop61, $pop60 +; NO-SIMD128-FAST-NEXT: i32.select $push63=, $16, $32, $pop62 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop63 ; NO-SIMD128-FAST-NEXT: return %c = icmp sgt <16 x i8> %x, %y %a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y @@ -1316,140 +1052,118 @@ define <16 x i8> @max_u_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-LABEL: max_u_v16i8: ; NO-SIMD128: .functype max_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push5=, 15 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 ; NO-SIMD128-NEXT: i32.const $push0=, 255 ; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0 -; NO-SIMD128-NEXT: i32.const $push117=, 255 -; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop117 +; NO-SIMD128-NEXT: i32.const $push95=, 255 +; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop95 ; NO-SIMD128-NEXT: i32.gt_u $push3=, $pop2, $pop1 ; NO-SIMD128-NEXT: 
i32.select $push4=, $16, $32, $pop3 -; NO-SIMD128-NEXT: i32.store8 0($pop6), $pop4 -; NO-SIMD128-NEXT: i32.const $push11=, 14 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.const $push116=, 255 -; NO-SIMD128-NEXT: i32.and $push8=, $15, $pop116 -; NO-SIMD128-NEXT: i32.const $push115=, 255 -; NO-SIMD128-NEXT: i32.and $push7=, $31, $pop115 -; NO-SIMD128-NEXT: i32.gt_u $push9=, $pop8, $pop7 -; NO-SIMD128-NEXT: i32.select $push10=, $15, $31, $pop9 -; NO-SIMD128-NEXT: i32.store8 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push17=, 13 -; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-NEXT: i32.const $push114=, 255 -; NO-SIMD128-NEXT: i32.and $push14=, $14, $pop114 -; NO-SIMD128-NEXT: i32.const $push113=, 255 -; NO-SIMD128-NEXT: i32.and $push13=, $30, $pop113 -; NO-SIMD128-NEXT: i32.gt_u $push15=, $pop14, $pop13 -; NO-SIMD128-NEXT: i32.select $push16=, $14, $30, $pop15 -; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16 -; NO-SIMD128-NEXT: i32.const $push23=, 12 -; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-NEXT: i32.const $push112=, 255 -; NO-SIMD128-NEXT: i32.and $push20=, $13, $pop112 -; NO-SIMD128-NEXT: i32.const $push111=, 255 -; NO-SIMD128-NEXT: i32.and $push19=, $29, $pop111 -; NO-SIMD128-NEXT: i32.gt_u $push21=, $pop20, $pop19 -; NO-SIMD128-NEXT: i32.select $push22=, $13, $29, $pop21 -; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22 -; NO-SIMD128-NEXT: i32.const $push29=, 11 -; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-NEXT: i32.const $push110=, 255 -; NO-SIMD128-NEXT: i32.and $push26=, $12, $pop110 -; NO-SIMD128-NEXT: i32.const $push109=, 255 -; NO-SIMD128-NEXT: i32.and $push25=, $28, $pop109 -; NO-SIMD128-NEXT: i32.gt_u $push27=, $pop26, $pop25 -; NO-SIMD128-NEXT: i32.select $push28=, $12, $28, $pop27 -; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28 -; NO-SIMD128-NEXT: i32.const $push35=, 10 -; NO-SIMD128-NEXT: i32.add $push36=, $0, $pop35 -; NO-SIMD128-NEXT: i32.const $push108=, 255 -; 
NO-SIMD128-NEXT: i32.and $push32=, $11, $pop108 -; NO-SIMD128-NEXT: i32.const $push107=, 255 -; NO-SIMD128-NEXT: i32.and $push31=, $27, $pop107 -; NO-SIMD128-NEXT: i32.gt_u $push33=, $pop32, $pop31 -; NO-SIMD128-NEXT: i32.select $push34=, $11, $27, $pop33 -; NO-SIMD128-NEXT: i32.store8 0($pop36), $pop34 -; NO-SIMD128-NEXT: i32.const $push41=, 9 -; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41 -; NO-SIMD128-NEXT: i32.const $push106=, 255 -; NO-SIMD128-NEXT: i32.and $push38=, $10, $pop106 -; NO-SIMD128-NEXT: i32.const $push105=, 255 -; NO-SIMD128-NEXT: i32.and $push37=, $26, $pop105 -; NO-SIMD128-NEXT: i32.gt_u $push39=, $pop38, $pop37 -; NO-SIMD128-NEXT: i32.select $push40=, $10, $26, $pop39 -; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40 -; NO-SIMD128-NEXT: i32.const $push104=, 255 -; NO-SIMD128-NEXT: i32.and $push44=, $9, $pop104 -; NO-SIMD128-NEXT: i32.const $push103=, 255 -; NO-SIMD128-NEXT: i32.and $push43=, $25, $pop103 -; NO-SIMD128-NEXT: i32.gt_u $push45=, $pop44, $pop43 -; NO-SIMD128-NEXT: i32.select $push46=, $9, $25, $pop45 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop46 -; NO-SIMD128-NEXT: i32.const $push51=, 7 -; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51 -; NO-SIMD128-NEXT: i32.const $push102=, 255 -; NO-SIMD128-NEXT: i32.and $push48=, $8, $pop102 -; NO-SIMD128-NEXT: i32.const $push101=, 255 -; NO-SIMD128-NEXT: i32.and $push47=, $24, $pop101 -; NO-SIMD128-NEXT: i32.gt_u $push49=, $pop48, $pop47 -; NO-SIMD128-NEXT: i32.select $push50=, $8, $24, $pop49 -; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50 -; NO-SIMD128-NEXT: i32.const $push57=, 6 -; NO-SIMD128-NEXT: i32.add $push58=, $0, $pop57 -; NO-SIMD128-NEXT: i32.const $push100=, 255 -; NO-SIMD128-NEXT: i32.and $push54=, $7, $pop100 -; NO-SIMD128-NEXT: i32.const $push99=, 255 -; NO-SIMD128-NEXT: i32.and $push53=, $23, $pop99 -; NO-SIMD128-NEXT: i32.gt_u $push55=, $pop54, $pop53 -; NO-SIMD128-NEXT: i32.select $push56=, $7, $23, $pop55 -; NO-SIMD128-NEXT: i32.store8 0($pop58), $pop56 -; NO-SIMD128-NEXT: 
i32.const $push63=, 5 -; NO-SIMD128-NEXT: i32.add $push64=, $0, $pop63 -; NO-SIMD128-NEXT: i32.const $push98=, 255 -; NO-SIMD128-NEXT: i32.and $push60=, $6, $pop98 -; NO-SIMD128-NEXT: i32.const $push97=, 255 -; NO-SIMD128-NEXT: i32.and $push59=, $22, $pop97 -; NO-SIMD128-NEXT: i32.gt_u $push61=, $pop60, $pop59 -; NO-SIMD128-NEXT: i32.select $push62=, $6, $22, $pop61 -; NO-SIMD128-NEXT: i32.store8 0($pop64), $pop62 -; NO-SIMD128-NEXT: i32.const $push96=, 255 -; NO-SIMD128-NEXT: i32.and $push66=, $5, $pop96 -; NO-SIMD128-NEXT: i32.const $push95=, 255 -; NO-SIMD128-NEXT: i32.and $push65=, $21, $pop95 -; NO-SIMD128-NEXT: i32.gt_u $push67=, $pop66, $pop65 -; NO-SIMD128-NEXT: i32.select $push68=, $5, $21, $pop67 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop68 -; NO-SIMD128-NEXT: i32.const $push73=, 3 -; NO-SIMD128-NEXT: i32.add $push74=, $0, $pop73 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop4 ; NO-SIMD128-NEXT: i32.const $push94=, 255 -; NO-SIMD128-NEXT: i32.and $push70=, $4, $pop94 +; NO-SIMD128-NEXT: i32.and $push6=, $15, $pop94 ; NO-SIMD128-NEXT: i32.const $push93=, 255 -; NO-SIMD128-NEXT: i32.and $push69=, $20, $pop93 -; NO-SIMD128-NEXT: i32.gt_u $push71=, $pop70, $pop69 -; NO-SIMD128-NEXT: i32.select $push72=, $4, $20, $pop71 -; NO-SIMD128-NEXT: i32.store8 0($pop74), $pop72 +; NO-SIMD128-NEXT: i32.and $push5=, $31, $pop93 +; NO-SIMD128-NEXT: i32.gt_u $push7=, $pop6, $pop5 +; NO-SIMD128-NEXT: i32.select $push8=, $15, $31, $pop7 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop8 ; NO-SIMD128-NEXT: i32.const $push92=, 255 -; NO-SIMD128-NEXT: i32.and $push76=, $3, $pop92 +; NO-SIMD128-NEXT: i32.and $push10=, $14, $pop92 ; NO-SIMD128-NEXT: i32.const $push91=, 255 -; NO-SIMD128-NEXT: i32.and $push75=, $19, $pop91 -; NO-SIMD128-NEXT: i32.gt_u $push77=, $pop76, $pop75 -; NO-SIMD128-NEXT: i32.select $push78=, $3, $19, $pop77 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop78 +; NO-SIMD128-NEXT: i32.and $push9=, $30, $pop91 +; NO-SIMD128-NEXT: i32.gt_u $push11=, $pop10, $pop9 +; 
NO-SIMD128-NEXT: i32.select $push12=, $14, $30, $pop11 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop12 ; NO-SIMD128-NEXT: i32.const $push90=, 255 -; NO-SIMD128-NEXT: i32.and $push80=, $2, $pop90 +; NO-SIMD128-NEXT: i32.and $push14=, $13, $pop90 ; NO-SIMD128-NEXT: i32.const $push89=, 255 -; NO-SIMD128-NEXT: i32.and $push79=, $18, $pop89 -; NO-SIMD128-NEXT: i32.gt_u $push81=, $pop80, $pop79 -; NO-SIMD128-NEXT: i32.select $push82=, $2, $18, $pop81 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop82 +; NO-SIMD128-NEXT: i32.and $push13=, $29, $pop89 +; NO-SIMD128-NEXT: i32.gt_u $push15=, $pop14, $pop13 +; NO-SIMD128-NEXT: i32.select $push16=, $13, $29, $pop15 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop16 ; NO-SIMD128-NEXT: i32.const $push88=, 255 -; NO-SIMD128-NEXT: i32.and $push84=, $1, $pop88 +; NO-SIMD128-NEXT: i32.and $push18=, $12, $pop88 ; NO-SIMD128-NEXT: i32.const $push87=, 255 -; NO-SIMD128-NEXT: i32.and $push83=, $17, $pop87 -; NO-SIMD128-NEXT: i32.gt_u $push85=, $pop84, $pop83 -; NO-SIMD128-NEXT: i32.select $push86=, $1, $17, $pop85 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop86 +; NO-SIMD128-NEXT: i32.and $push17=, $28, $pop87 +; NO-SIMD128-NEXT: i32.gt_u $push19=, $pop18, $pop17 +; NO-SIMD128-NEXT: i32.select $push20=, $12, $28, $pop19 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop20 +; NO-SIMD128-NEXT: i32.const $push86=, 255 +; NO-SIMD128-NEXT: i32.and $push22=, $11, $pop86 +; NO-SIMD128-NEXT: i32.const $push85=, 255 +; NO-SIMD128-NEXT: i32.and $push21=, $27, $pop85 +; NO-SIMD128-NEXT: i32.gt_u $push23=, $pop22, $pop21 +; NO-SIMD128-NEXT: i32.select $push24=, $11, $27, $pop23 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop24 +; NO-SIMD128-NEXT: i32.const $push84=, 255 +; NO-SIMD128-NEXT: i32.and $push26=, $10, $pop84 +; NO-SIMD128-NEXT: i32.const $push83=, 255 +; NO-SIMD128-NEXT: i32.and $push25=, $26, $pop83 +; NO-SIMD128-NEXT: i32.gt_u $push27=, $pop26, $pop25 +; NO-SIMD128-NEXT: i32.select $push28=, $10, $26, $pop27 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop28 +; 
NO-SIMD128-NEXT: i32.const $push82=, 255 +; NO-SIMD128-NEXT: i32.and $push30=, $9, $pop82 +; NO-SIMD128-NEXT: i32.const $push81=, 255 +; NO-SIMD128-NEXT: i32.and $push29=, $25, $pop81 +; NO-SIMD128-NEXT: i32.gt_u $push31=, $pop30, $pop29 +; NO-SIMD128-NEXT: i32.select $push32=, $9, $25, $pop31 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop32 +; NO-SIMD128-NEXT: i32.const $push80=, 255 +; NO-SIMD128-NEXT: i32.and $push34=, $8, $pop80 +; NO-SIMD128-NEXT: i32.const $push79=, 255 +; NO-SIMD128-NEXT: i32.and $push33=, $24, $pop79 +; NO-SIMD128-NEXT: i32.gt_u $push35=, $pop34, $pop33 +; NO-SIMD128-NEXT: i32.select $push36=, $8, $24, $pop35 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop36 +; NO-SIMD128-NEXT: i32.const $push78=, 255 +; NO-SIMD128-NEXT: i32.and $push38=, $7, $pop78 +; NO-SIMD128-NEXT: i32.const $push77=, 255 +; NO-SIMD128-NEXT: i32.and $push37=, $23, $pop77 +; NO-SIMD128-NEXT: i32.gt_u $push39=, $pop38, $pop37 +; NO-SIMD128-NEXT: i32.select $push40=, $7, $23, $pop39 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop40 +; NO-SIMD128-NEXT: i32.const $push76=, 255 +; NO-SIMD128-NEXT: i32.and $push42=, $6, $pop76 +; NO-SIMD128-NEXT: i32.const $push75=, 255 +; NO-SIMD128-NEXT: i32.and $push41=, $22, $pop75 +; NO-SIMD128-NEXT: i32.gt_u $push43=, $pop42, $pop41 +; NO-SIMD128-NEXT: i32.select $push44=, $6, $22, $pop43 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop44 +; NO-SIMD128-NEXT: i32.const $push74=, 255 +; NO-SIMD128-NEXT: i32.and $push46=, $5, $pop74 +; NO-SIMD128-NEXT: i32.const $push73=, 255 +; NO-SIMD128-NEXT: i32.and $push45=, $21, $pop73 +; NO-SIMD128-NEXT: i32.gt_u $push47=, $pop46, $pop45 +; NO-SIMD128-NEXT: i32.select $push48=, $5, $21, $pop47 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop48 +; NO-SIMD128-NEXT: i32.const $push72=, 255 +; NO-SIMD128-NEXT: i32.and $push50=, $4, $pop72 +; NO-SIMD128-NEXT: i32.const $push71=, 255 +; NO-SIMD128-NEXT: i32.and $push49=, $20, $pop71 +; NO-SIMD128-NEXT: i32.gt_u $push51=, $pop50, $pop49 +; NO-SIMD128-NEXT: i32.select $push52=, $4, 
$20, $pop51 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop52 +; NO-SIMD128-NEXT: i32.const $push70=, 255 +; NO-SIMD128-NEXT: i32.and $push54=, $3, $pop70 +; NO-SIMD128-NEXT: i32.const $push69=, 255 +; NO-SIMD128-NEXT: i32.and $push53=, $19, $pop69 +; NO-SIMD128-NEXT: i32.gt_u $push55=, $pop54, $pop53 +; NO-SIMD128-NEXT: i32.select $push56=, $3, $19, $pop55 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop56 +; NO-SIMD128-NEXT: i32.const $push68=, 255 +; NO-SIMD128-NEXT: i32.and $push58=, $2, $pop68 +; NO-SIMD128-NEXT: i32.const $push67=, 255 +; NO-SIMD128-NEXT: i32.and $push57=, $18, $pop67 +; NO-SIMD128-NEXT: i32.gt_u $push59=, $pop58, $pop57 +; NO-SIMD128-NEXT: i32.select $push60=, $2, $18, $pop59 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop60 +; NO-SIMD128-NEXT: i32.const $push66=, 255 +; NO-SIMD128-NEXT: i32.and $push62=, $1, $pop66 +; NO-SIMD128-NEXT: i32.const $push65=, 255 +; NO-SIMD128-NEXT: i32.and $push61=, $17, $pop65 +; NO-SIMD128-NEXT: i32.gt_u $push63=, $pop62, $pop61 +; NO-SIMD128-NEXT: i32.select $push64=, $1, $17, $pop63 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop64 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: max_u_v16i8: @@ -1457,138 +1171,116 @@ define <16 x i8> @max_u_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255 ; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0 -; NO-SIMD128-FAST-NEXT: i32.const $push117=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop117 +; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop95 ; NO-SIMD128-FAST-NEXT: i32.gt_u $push3=, $pop2, $pop1 ; NO-SIMD128-FAST-NEXT: i32.select $push4=, $1, $17, $pop3 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push116=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop116 -; NO-SIMD128-FAST-NEXT: i32.const $push115=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $18, $pop115 +; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255 +; 
NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop94 +; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $18, $pop93 ; NO-SIMD128-FAST-NEXT: i32.gt_u $push7=, $pop6, $pop5 ; NO-SIMD128-FAST-NEXT: i32.select $push8=, $2, $18, $pop7 ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push114=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop114 -; NO-SIMD128-FAST-NEXT: i32.const $push113=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push9=, $19, $pop113 +; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop92 +; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push9=, $19, $pop91 ; NO-SIMD128-FAST-NEXT: i32.gt_u $push11=, $pop10, $pop9 ; NO-SIMD128-FAST-NEXT: i32.select $push12=, $3, $19, $pop11 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push112=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop112 -; NO-SIMD128-FAST-NEXT: i32.const $push111=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push13=, $20, $pop111 +; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop90 +; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $20, $pop89 ; NO-SIMD128-FAST-NEXT: i32.gt_u $push15=, $pop14, $pop13 ; NO-SIMD128-FAST-NEXT: i32.select $push16=, $4, $20, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push110=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push20=, $5, $pop110 -; NO-SIMD128-FAST-NEXT: i32.const $push109=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push19=, $21, $pop109 -; NO-SIMD128-FAST-NEXT: i32.gt_u $push21=, $pop20, $pop19 -; NO-SIMD128-FAST-NEXT: i32.select $push22=, $5, $21, $pop21 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop22 -; NO-SIMD128-FAST-NEXT: 
i32.const $push27=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.const $push108=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push24=, $6, $pop108 -; NO-SIMD128-FAST-NEXT: i32.const $push107=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push23=, $22, $pop107 -; NO-SIMD128-FAST-NEXT: i32.gt_u $push25=, $pop24, $pop23 -; NO-SIMD128-FAST-NEXT: i32.select $push26=, $6, $22, $pop25 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push33=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-FAST-NEXT: i32.const $push106=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push30=, $7, $pop106 -; NO-SIMD128-FAST-NEXT: i32.const $push105=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push29=, $23, $pop105 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push18=, $5, $pop88 +; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $21, $pop87 +; NO-SIMD128-FAST-NEXT: i32.gt_u $push19=, $pop18, $pop17 +; NO-SIMD128-FAST-NEXT: i32.select $push20=, $5, $21, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.const $push86=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $6, $pop86 +; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push21=, $22, $pop85 +; NO-SIMD128-FAST-NEXT: i32.gt_u $push23=, $pop22, $pop21 +; NO-SIMD128-FAST-NEXT: i32.select $push24=, $6, $22, $pop23 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push26=, $7, $pop84 +; NO-SIMD128-FAST-NEXT: i32.const $push83=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push25=, $23, $pop83 +; NO-SIMD128-FAST-NEXT: i32.gt_u $push27=, $pop26, $pop25 +; NO-SIMD128-FAST-NEXT: i32.select $push28=, $7, $23, $pop27 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28 +; NO-SIMD128-FAST-NEXT: i32.const 
$push82=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push30=, $8, $pop82 +; NO-SIMD128-FAST-NEXT: i32.const $push81=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push29=, $24, $pop81 ; NO-SIMD128-FAST-NEXT: i32.gt_u $push31=, $pop30, $pop29 -; NO-SIMD128-FAST-NEXT: i32.select $push32=, $7, $23, $pop31 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39 -; NO-SIMD128-FAST-NEXT: i32.const $push104=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push36=, $8, $pop104 -; NO-SIMD128-FAST-NEXT: i32.const $push103=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push35=, $24, $pop103 -; NO-SIMD128-FAST-NEXT: i32.gt_u $push37=, $pop36, $pop35 -; NO-SIMD128-FAST-NEXT: i32.select $push38=, $8, $24, $pop37 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38 -; NO-SIMD128-FAST-NEXT: i32.const $push102=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push42=, $9, $pop102 -; NO-SIMD128-FAST-NEXT: i32.const $push101=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push41=, $25, $pop101 +; NO-SIMD128-FAST-NEXT: i32.select $push32=, $8, $24, $pop31 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32 +; NO-SIMD128-FAST-NEXT: i32.const $push80=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push34=, $9, $pop80 +; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push33=, $25, $pop79 +; NO-SIMD128-FAST-NEXT: i32.gt_u $push35=, $pop34, $pop33 +; NO-SIMD128-FAST-NEXT: i32.select $push36=, $9, $25, $pop35 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36 +; NO-SIMD128-FAST-NEXT: i32.const $push78=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push38=, $10, $pop78 +; NO-SIMD128-FAST-NEXT: i32.const $push77=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push37=, $26, $pop77 +; NO-SIMD128-FAST-NEXT: i32.gt_u $push39=, $pop38, $pop37 +; NO-SIMD128-FAST-NEXT: i32.select $push40=, $10, $26, $pop39 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40 +; NO-SIMD128-FAST-NEXT: i32.const $push76=, 255 +; NO-SIMD128-FAST-NEXT: i32.and 
$push42=, $11, $pop76 +; NO-SIMD128-FAST-NEXT: i32.const $push75=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push41=, $27, $pop75 ; NO-SIMD128-FAST-NEXT: i32.gt_u $push43=, $pop42, $pop41 -; NO-SIMD128-FAST-NEXT: i32.select $push44=, $9, $25, $pop43 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop44 -; NO-SIMD128-FAST-NEXT: i32.const $push49=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49 -; NO-SIMD128-FAST-NEXT: i32.const $push100=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push46=, $10, $pop100 -; NO-SIMD128-FAST-NEXT: i32.const $push99=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push45=, $26, $pop99 +; NO-SIMD128-FAST-NEXT: i32.select $push44=, $11, $27, $pop43 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44 +; NO-SIMD128-FAST-NEXT: i32.const $push74=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push46=, $12, $pop74 +; NO-SIMD128-FAST-NEXT: i32.const $push73=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push45=, $28, $pop73 ; NO-SIMD128-FAST-NEXT: i32.gt_u $push47=, $pop46, $pop45 -; NO-SIMD128-FAST-NEXT: i32.select $push48=, $10, $26, $pop47 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48 -; NO-SIMD128-FAST-NEXT: i32.const $push55=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55 -; NO-SIMD128-FAST-NEXT: i32.const $push98=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push52=, $11, $pop98 -; NO-SIMD128-FAST-NEXT: i32.const $push97=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push51=, $27, $pop97 -; NO-SIMD128-FAST-NEXT: i32.gt_u $push53=, $pop52, $pop51 -; NO-SIMD128-FAST-NEXT: i32.select $push54=, $11, $27, $pop53 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop54 -; NO-SIMD128-FAST-NEXT: i32.const $push61=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61 -; NO-SIMD128-FAST-NEXT: i32.const $push96=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push58=, $12, $pop96 -; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push57=, $28, $pop95 +; NO-SIMD128-FAST-NEXT: i32.select $push48=, $12, $28, $pop47 +; NO-SIMD128-FAST-NEXT: i32.store8 
11($0), $pop48 +; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push50=, $13, $pop72 +; NO-SIMD128-FAST-NEXT: i32.const $push71=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push49=, $29, $pop71 +; NO-SIMD128-FAST-NEXT: i32.gt_u $push51=, $pop50, $pop49 +; NO-SIMD128-FAST-NEXT: i32.select $push52=, $13, $29, $pop51 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52 +; NO-SIMD128-FAST-NEXT: i32.const $push70=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push54=, $14, $pop70 +; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push53=, $30, $pop69 +; NO-SIMD128-FAST-NEXT: i32.gt_u $push55=, $pop54, $pop53 +; NO-SIMD128-FAST-NEXT: i32.select $push56=, $14, $30, $pop55 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56 +; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push58=, $15, $pop68 +; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push57=, $31, $pop67 ; NO-SIMD128-FAST-NEXT: i32.gt_u $push59=, $pop58, $pop57 -; NO-SIMD128-FAST-NEXT: i32.select $push60=, $12, $28, $pop59 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop60 -; NO-SIMD128-FAST-NEXT: i32.const $push67=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push68=, $0, $pop67 -; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push64=, $13, $pop94 -; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push63=, $29, $pop93 -; NO-SIMD128-FAST-NEXT: i32.gt_u $push65=, $pop64, $pop63 -; NO-SIMD128-FAST-NEXT: i32.select $push66=, $13, $29, $pop65 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop68), $pop66 -; NO-SIMD128-FAST-NEXT: i32.const $push73=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push74=, $0, $pop73 -; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push70=, $14, $pop92 -; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push69=, $30, $pop91 -; NO-SIMD128-FAST-NEXT: i32.gt_u $push71=, 
$pop70, $pop69 -; NO-SIMD128-FAST-NEXT: i32.select $push72=, $14, $30, $pop71 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop74), $pop72 -; NO-SIMD128-FAST-NEXT: i32.const $push79=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push80=, $0, $pop79 -; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push76=, $15, $pop90 -; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push75=, $31, $pop89 -; NO-SIMD128-FAST-NEXT: i32.gt_u $push77=, $pop76, $pop75 -; NO-SIMD128-FAST-NEXT: i32.select $push78=, $15, $31, $pop77 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop80), $pop78 -; NO-SIMD128-FAST-NEXT: i32.const $push85=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push86=, $0, $pop85 -; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push82=, $16, $pop88 -; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push81=, $32, $pop87 -; NO-SIMD128-FAST-NEXT: i32.gt_u $push83=, $pop82, $pop81 -; NO-SIMD128-FAST-NEXT: i32.select $push84=, $16, $32, $pop83 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop86), $pop84 +; NO-SIMD128-FAST-NEXT: i32.select $push60=, $15, $31, $pop59 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60 +; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push62=, $16, $pop66 +; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push61=, $32, $pop65 +; NO-SIMD128-FAST-NEXT: i32.gt_u $push63=, $pop62, $pop61 +; NO-SIMD128-FAST-NEXT: i32.select $push64=, $16, $32, $pop63 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64 ; NO-SIMD128-FAST-NEXT: return %c = icmp ugt <16 x i8> %x, %y %a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y @@ -1611,156 +1303,134 @@ define <16 x i8> @avgr_u_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-LABEL: avgr_u_v16i8: ; NO-SIMD128: .functype avgr_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push0=, 15 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.add $push2=, $16, $32 -; NO-SIMD128-NEXT: i32.const $push3=, 1 -; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3 -; NO-SIMD128-NEXT: i32.const $push5=, 254 -; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5 -; NO-SIMD128-NEXT: i32.const $push133=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop133 -; NO-SIMD128-NEXT: i32.store8 0($pop1), $pop7 -; NO-SIMD128-NEXT: i32.const $push8=, 14 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.add $push10=, $15, $31 -; NO-SIMD128-NEXT: i32.const $push132=, 1 -; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop132 -; NO-SIMD128-NEXT: i32.const $push131=, 254 -; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop131 -; NO-SIMD128-NEXT: i32.const $push130=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop130 -; NO-SIMD128-NEXT: i32.store8 0($pop9), $pop13 -; NO-SIMD128-NEXT: i32.const $push14=, 13 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.add $push16=, $14, $30 -; NO-SIMD128-NEXT: i32.const $push129=, 1 -; NO-SIMD128-NEXT: i32.add $push17=, $pop16, $pop129 -; NO-SIMD128-NEXT: i32.const $push128=, 254 -; NO-SIMD128-NEXT: i32.and $push18=, $pop17, $pop128 -; NO-SIMD128-NEXT: i32.const $push127=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push19=, $pop18, $pop127 -; NO-SIMD128-NEXT: i32.store8 0($pop15), $pop19 -; NO-SIMD128-NEXT: i32.const $push20=, 12 -; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20 -; NO-SIMD128-NEXT: i32.add $push22=, $13, $29 -; NO-SIMD128-NEXT: i32.const $push126=, 1 -; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop126 -; NO-SIMD128-NEXT: i32.const $push125=, 254 -; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop125 -; NO-SIMD128-NEXT: i32.const $push124=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop124 -; NO-SIMD128-NEXT: i32.store8 0($pop21), $pop25 -; 
NO-SIMD128-NEXT: i32.const $push26=, 11 -; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-NEXT: i32.add $push28=, $12, $28 -; NO-SIMD128-NEXT: i32.const $push123=, 1 -; NO-SIMD128-NEXT: i32.add $push29=, $pop28, $pop123 -; NO-SIMD128-NEXT: i32.const $push122=, 254 -; NO-SIMD128-NEXT: i32.and $push30=, $pop29, $pop122 -; NO-SIMD128-NEXT: i32.const $push121=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push31=, $pop30, $pop121 -; NO-SIMD128-NEXT: i32.store8 0($pop27), $pop31 -; NO-SIMD128-NEXT: i32.const $push32=, 10 -; NO-SIMD128-NEXT: i32.add $push33=, $0, $pop32 -; NO-SIMD128-NEXT: i32.add $push34=, $11, $27 -; NO-SIMD128-NEXT: i32.const $push120=, 1 -; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop120 -; NO-SIMD128-NEXT: i32.const $push119=, 254 -; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop119 -; NO-SIMD128-NEXT: i32.const $push118=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop118 -; NO-SIMD128-NEXT: i32.store8 0($pop33), $pop37 -; NO-SIMD128-NEXT: i32.const $push38=, 9 -; NO-SIMD128-NEXT: i32.add $push39=, $0, $pop38 -; NO-SIMD128-NEXT: i32.add $push40=, $10, $26 -; NO-SIMD128-NEXT: i32.const $push117=, 1 -; NO-SIMD128-NEXT: i32.add $push41=, $pop40, $pop117 -; NO-SIMD128-NEXT: i32.const $push116=, 254 -; NO-SIMD128-NEXT: i32.and $push42=, $pop41, $pop116 -; NO-SIMD128-NEXT: i32.const $push115=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push43=, $pop42, $pop115 -; NO-SIMD128-NEXT: i32.store8 0($pop39), $pop43 -; NO-SIMD128-NEXT: i32.add $push44=, $9, $25 -; NO-SIMD128-NEXT: i32.const $push114=, 1 -; NO-SIMD128-NEXT: i32.add $push45=, $pop44, $pop114 -; NO-SIMD128-NEXT: i32.const $push113=, 254 -; NO-SIMD128-NEXT: i32.and $push46=, $pop45, $pop113 -; NO-SIMD128-NEXT: i32.const $push112=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push47=, $pop46, $pop112 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop47 -; NO-SIMD128-NEXT: i32.const $push48=, 7 -; NO-SIMD128-NEXT: i32.add $push49=, $0, $pop48 -; NO-SIMD128-NEXT: i32.add $push50=, $8, $24 +; NO-SIMD128-NEXT: i32.add 
$push0=, $16, $32 +; NO-SIMD128-NEXT: i32.const $push1=, 1 +; NO-SIMD128-NEXT: i32.add $push2=, $pop0, $pop1 +; NO-SIMD128-NEXT: i32.const $push3=, 254 +; NO-SIMD128-NEXT: i32.and $push4=, $pop2, $pop3 ; NO-SIMD128-NEXT: i32.const $push111=, 1 -; NO-SIMD128-NEXT: i32.add $push51=, $pop50, $pop111 -; NO-SIMD128-NEXT: i32.const $push110=, 254 -; NO-SIMD128-NEXT: i32.and $push52=, $pop51, $pop110 -; NO-SIMD128-NEXT: i32.const $push109=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop109 -; NO-SIMD128-NEXT: i32.store8 0($pop49), $pop53 -; NO-SIMD128-NEXT: i32.const $push54=, 6 -; NO-SIMD128-NEXT: i32.add $push55=, $0, $pop54 -; NO-SIMD128-NEXT: i32.add $push56=, $7, $23 +; NO-SIMD128-NEXT: i32.shr_u $push5=, $pop4, $pop111 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop5 +; NO-SIMD128-NEXT: i32.add $push6=, $15, $31 +; NO-SIMD128-NEXT: i32.const $push110=, 1 +; NO-SIMD128-NEXT: i32.add $push7=, $pop6, $pop110 +; NO-SIMD128-NEXT: i32.const $push109=, 254 +; NO-SIMD128-NEXT: i32.and $push8=, $pop7, $pop109 ; NO-SIMD128-NEXT: i32.const $push108=, 1 -; NO-SIMD128-NEXT: i32.add $push57=, $pop56, $pop108 -; NO-SIMD128-NEXT: i32.const $push107=, 254 -; NO-SIMD128-NEXT: i32.and $push58=, $pop57, $pop107 -; NO-SIMD128-NEXT: i32.const $push106=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push59=, $pop58, $pop106 -; NO-SIMD128-NEXT: i32.store8 0($pop55), $pop59 -; NO-SIMD128-NEXT: i32.const $push60=, 5 -; NO-SIMD128-NEXT: i32.add $push61=, $0, $pop60 -; NO-SIMD128-NEXT: i32.add $push62=, $6, $22 +; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop108 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop9 +; NO-SIMD128-NEXT: i32.add $push10=, $14, $30 +; NO-SIMD128-NEXT: i32.const $push107=, 1 +; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop107 +; NO-SIMD128-NEXT: i32.const $push106=, 254 +; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop106 ; NO-SIMD128-NEXT: i32.const $push105=, 1 -; NO-SIMD128-NEXT: i32.add $push63=, $pop62, $pop105 -; NO-SIMD128-NEXT: i32.const $push104=, 254 -; 
NO-SIMD128-NEXT: i32.and $push64=, $pop63, $pop104 -; NO-SIMD128-NEXT: i32.const $push103=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push65=, $pop64, $pop103 -; NO-SIMD128-NEXT: i32.store8 0($pop61), $pop65 -; NO-SIMD128-NEXT: i32.add $push66=, $5, $21 +; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop105 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop13 +; NO-SIMD128-NEXT: i32.add $push14=, $13, $29 +; NO-SIMD128-NEXT: i32.const $push104=, 1 +; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop104 +; NO-SIMD128-NEXT: i32.const $push103=, 254 +; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $pop103 ; NO-SIMD128-NEXT: i32.const $push102=, 1 -; NO-SIMD128-NEXT: i32.add $push67=, $pop66, $pop102 -; NO-SIMD128-NEXT: i32.const $push101=, 254 -; NO-SIMD128-NEXT: i32.and $push68=, $pop67, $pop101 -; NO-SIMD128-NEXT: i32.const $push100=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push69=, $pop68, $pop100 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop69 -; NO-SIMD128-NEXT: i32.const $push70=, 3 -; NO-SIMD128-NEXT: i32.add $push71=, $0, $pop70 -; NO-SIMD128-NEXT: i32.add $push72=, $4, $20 +; NO-SIMD128-NEXT: i32.shr_u $push17=, $pop16, $pop102 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop17 +; NO-SIMD128-NEXT: i32.add $push18=, $12, $28 +; NO-SIMD128-NEXT: i32.const $push101=, 1 +; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop101 +; NO-SIMD128-NEXT: i32.const $push100=, 254 +; NO-SIMD128-NEXT: i32.and $push20=, $pop19, $pop100 ; NO-SIMD128-NEXT: i32.const $push99=, 1 -; NO-SIMD128-NEXT: i32.add $push73=, $pop72, $pop99 -; NO-SIMD128-NEXT: i32.const $push98=, 254 -; NO-SIMD128-NEXT: i32.and $push74=, $pop73, $pop98 -; NO-SIMD128-NEXT: i32.const $push97=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push75=, $pop74, $pop97 -; NO-SIMD128-NEXT: i32.store8 0($pop71), $pop75 -; NO-SIMD128-NEXT: i32.add $push76=, $3, $19 +; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop99 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop21 +; NO-SIMD128-NEXT: i32.add $push22=, $11, $27 +; NO-SIMD128-NEXT: i32.const $push98=, 1 +; 
NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop98 +; NO-SIMD128-NEXT: i32.const $push97=, 254 +; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop97 ; NO-SIMD128-NEXT: i32.const $push96=, 1 -; NO-SIMD128-NEXT: i32.add $push77=, $pop76, $pop96 -; NO-SIMD128-NEXT: i32.const $push95=, 254 -; NO-SIMD128-NEXT: i32.and $push78=, $pop77, $pop95 -; NO-SIMD128-NEXT: i32.const $push94=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push79=, $pop78, $pop94 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop79 -; NO-SIMD128-NEXT: i32.add $push80=, $2, $18 +; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop96 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop25 +; NO-SIMD128-NEXT: i32.add $push26=, $10, $26 +; NO-SIMD128-NEXT: i32.const $push95=, 1 +; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop95 +; NO-SIMD128-NEXT: i32.const $push94=, 254 +; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop94 ; NO-SIMD128-NEXT: i32.const $push93=, 1 -; NO-SIMD128-NEXT: i32.add $push81=, $pop80, $pop93 -; NO-SIMD128-NEXT: i32.const $push92=, 254 -; NO-SIMD128-NEXT: i32.and $push82=, $pop81, $pop92 -; NO-SIMD128-NEXT: i32.const $push91=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push83=, $pop82, $pop91 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop83 -; NO-SIMD128-NEXT: i32.add $push84=, $1, $17 +; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop93 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop29 +; NO-SIMD128-NEXT: i32.add $push30=, $9, $25 +; NO-SIMD128-NEXT: i32.const $push92=, 1 +; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop92 +; NO-SIMD128-NEXT: i32.const $push91=, 254 +; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop91 ; NO-SIMD128-NEXT: i32.const $push90=, 1 -; NO-SIMD128-NEXT: i32.add $push85=, $pop84, $pop90 -; NO-SIMD128-NEXT: i32.const $push89=, 254 -; NO-SIMD128-NEXT: i32.and $push86=, $pop85, $pop89 -; NO-SIMD128-NEXT: i32.const $push88=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push87=, $pop86, $pop88 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop87 +; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop90 +; NO-SIMD128-NEXT: 
i32.store8 8($0), $pop33 +; NO-SIMD128-NEXT: i32.add $push34=, $8, $24 +; NO-SIMD128-NEXT: i32.const $push89=, 1 +; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop89 +; NO-SIMD128-NEXT: i32.const $push88=, 254 +; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop88 +; NO-SIMD128-NEXT: i32.const $push87=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop87 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop37 +; NO-SIMD128-NEXT: i32.add $push38=, $7, $23 +; NO-SIMD128-NEXT: i32.const $push86=, 1 +; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop86 +; NO-SIMD128-NEXT: i32.const $push85=, 254 +; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $pop85 +; NO-SIMD128-NEXT: i32.const $push84=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop84 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop41 +; NO-SIMD128-NEXT: i32.add $push42=, $6, $22 +; NO-SIMD128-NEXT: i32.const $push83=, 1 +; NO-SIMD128-NEXT: i32.add $push43=, $pop42, $pop83 +; NO-SIMD128-NEXT: i32.const $push82=, 254 +; NO-SIMD128-NEXT: i32.and $push44=, $pop43, $pop82 +; NO-SIMD128-NEXT: i32.const $push81=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push45=, $pop44, $pop81 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop45 +; NO-SIMD128-NEXT: i32.add $push46=, $5, $21 +; NO-SIMD128-NEXT: i32.const $push80=, 1 +; NO-SIMD128-NEXT: i32.add $push47=, $pop46, $pop80 +; NO-SIMD128-NEXT: i32.const $push79=, 254 +; NO-SIMD128-NEXT: i32.and $push48=, $pop47, $pop79 +; NO-SIMD128-NEXT: i32.const $push78=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push49=, $pop48, $pop78 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop49 +; NO-SIMD128-NEXT: i32.add $push50=, $4, $20 +; NO-SIMD128-NEXT: i32.const $push77=, 1 +; NO-SIMD128-NEXT: i32.add $push51=, $pop50, $pop77 +; NO-SIMD128-NEXT: i32.const $push76=, 254 +; NO-SIMD128-NEXT: i32.and $push52=, $pop51, $pop76 +; NO-SIMD128-NEXT: i32.const $push75=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop75 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop53 +; NO-SIMD128-NEXT: i32.add $push54=, $3, $19 +; NO-SIMD128-NEXT: 
i32.const $push74=, 1 +; NO-SIMD128-NEXT: i32.add $push55=, $pop54, $pop74 +; NO-SIMD128-NEXT: i32.const $push73=, 254 +; NO-SIMD128-NEXT: i32.and $push56=, $pop55, $pop73 +; NO-SIMD128-NEXT: i32.const $push72=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push57=, $pop56, $pop72 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop57 +; NO-SIMD128-NEXT: i32.add $push58=, $2, $18 +; NO-SIMD128-NEXT: i32.const $push71=, 1 +; NO-SIMD128-NEXT: i32.add $push59=, $pop58, $pop71 +; NO-SIMD128-NEXT: i32.const $push70=, 254 +; NO-SIMD128-NEXT: i32.and $push60=, $pop59, $pop70 +; NO-SIMD128-NEXT: i32.const $push69=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push61=, $pop60, $pop69 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop61 +; NO-SIMD128-NEXT: i32.add $push62=, $1, $17 +; NO-SIMD128-NEXT: i32.const $push68=, 1 +; NO-SIMD128-NEXT: i32.add $push63=, $pop62, $pop68 +; NO-SIMD128-NEXT: i32.const $push67=, 254 +; NO-SIMD128-NEXT: i32.and $push64=, $pop63, $pop67 +; NO-SIMD128-NEXT: i32.const $push66=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push65=, $pop64, $pop66 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop65 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: avgr_u_v16i8: @@ -1771,151 +1441,129 @@ define <16 x i8> @avgr_u_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-FAST-NEXT: i32.add $push2=, $pop0, $pop1 ; NO-SIMD128-FAST-NEXT: i32.const $push3=, 254 ; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop2, $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push133=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop133 +; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop111 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop5 ; NO-SIMD128-FAST-NEXT: i32.add $push6=, $2, $18 -; NO-SIMD128-FAST-NEXT: i32.const $push132=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop132 -; NO-SIMD128-FAST-NEXT: i32.const $push131=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop131 -; NO-SIMD128-FAST-NEXT: i32.const $push130=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, 
$pop8, $pop130 +; NO-SIMD128-FAST-NEXT: i32.const $push110=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop110 +; NO-SIMD128-FAST-NEXT: i32.const $push109=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop109 +; NO-SIMD128-FAST-NEXT: i32.const $push108=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop108 ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop9 ; NO-SIMD128-FAST-NEXT: i32.add $push10=, $3, $19 -; NO-SIMD128-FAST-NEXT: i32.const $push129=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop129 -; NO-SIMD128-FAST-NEXT: i32.const $push128=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop128 -; NO-SIMD128-FAST-NEXT: i32.const $push127=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop127 -; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push14=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-FAST-NEXT: i32.add $push16=, $4, $20 -; NO-SIMD128-FAST-NEXT: i32.const $push126=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push17=, $pop16, $pop126 -; NO-SIMD128-FAST-NEXT: i32.const $push125=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push18=, $pop17, $pop125 -; NO-SIMD128-FAST-NEXT: i32.const $push124=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push19=, $pop18, $pop124 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop19 -; NO-SIMD128-FAST-NEXT: i32.add $push20=, $5, $21 -; NO-SIMD128-FAST-NEXT: i32.const $push123=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push21=, $pop20, $pop123 -; NO-SIMD128-FAST-NEXT: i32.const $push122=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $pop122 -; NO-SIMD128-FAST-NEXT: i32.const $push121=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push23=, $pop22, $pop121 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop23 -; NO-SIMD128-FAST-NEXT: i32.const $push24=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-FAST-NEXT: i32.add $push26=, $6, $22 -; NO-SIMD128-FAST-NEXT: i32.const $push120=, 1 -; NO-SIMD128-FAST-NEXT: i32.add 
$push27=, $pop26, $pop120 -; NO-SIMD128-FAST-NEXT: i32.const $push119=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop119 -; NO-SIMD128-FAST-NEXT: i32.const $push118=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop118 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop29 -; NO-SIMD128-FAST-NEXT: i32.const $push30=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30 -; NO-SIMD128-FAST-NEXT: i32.add $push32=, $7, $23 -; NO-SIMD128-FAST-NEXT: i32.const $push117=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push33=, $pop32, $pop117 -; NO-SIMD128-FAST-NEXT: i32.const $push116=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $pop116 -; NO-SIMD128-FAST-NEXT: i32.const $push115=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop115 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop35 -; NO-SIMD128-FAST-NEXT: i32.const $push36=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36 -; NO-SIMD128-FAST-NEXT: i32.add $push38=, $8, $24 -; NO-SIMD128-FAST-NEXT: i32.const $push114=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop114 -; NO-SIMD128-FAST-NEXT: i32.const $push113=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop113 -; NO-SIMD128-FAST-NEXT: i32.const $push112=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop112 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop41 -; NO-SIMD128-FAST-NEXT: i32.add $push42=, $9, $25 -; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push43=, $pop42, $pop111 -; NO-SIMD128-FAST-NEXT: i32.const $push110=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push44=, $pop43, $pop110 -; NO-SIMD128-FAST-NEXT: i32.const $push109=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop109 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop45 -; NO-SIMD128-FAST-NEXT: i32.const $push46=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push47=, $0, $pop46 -; NO-SIMD128-FAST-NEXT: i32.add $push48=, $10, $26 -; NO-SIMD128-FAST-NEXT: i32.const $push108=, 1 -; 
NO-SIMD128-FAST-NEXT: i32.add $push49=, $pop48, $pop108 -; NO-SIMD128-FAST-NEXT: i32.const $push107=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push50=, $pop49, $pop107 -; NO-SIMD128-FAST-NEXT: i32.const $push106=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push51=, $pop50, $pop106 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop47), $pop51 -; NO-SIMD128-FAST-NEXT: i32.const $push52=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push53=, $0, $pop52 -; NO-SIMD128-FAST-NEXT: i32.add $push54=, $11, $27 +; NO-SIMD128-FAST-NEXT: i32.const $push107=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop107 +; NO-SIMD128-FAST-NEXT: i32.const $push106=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop106 ; NO-SIMD128-FAST-NEXT: i32.const $push105=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop105 -; NO-SIMD128-FAST-NEXT: i32.const $push104=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push56=, $pop55, $pop104 -; NO-SIMD128-FAST-NEXT: i32.const $push103=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push57=, $pop56, $pop103 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop53), $pop57 -; NO-SIMD128-FAST-NEXT: i32.const $push58=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push59=, $0, $pop58 -; NO-SIMD128-FAST-NEXT: i32.add $push60=, $12, $28 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop105 +; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop13 +; NO-SIMD128-FAST-NEXT: i32.add $push14=, $4, $20 +; NO-SIMD128-FAST-NEXT: i32.const $push104=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop104 +; NO-SIMD128-FAST-NEXT: i32.const $push103=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $pop103 ; NO-SIMD128-FAST-NEXT: i32.const $push102=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push61=, $pop60, $pop102 -; NO-SIMD128-FAST-NEXT: i32.const $push101=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push62=, $pop61, $pop101 -; NO-SIMD128-FAST-NEXT: i32.const $push100=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push63=, $pop62, $pop100 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop59), $pop63 -; 
NO-SIMD128-FAST-NEXT: i32.const $push64=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push65=, $0, $pop64 -; NO-SIMD128-FAST-NEXT: i32.add $push66=, $13, $29 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop102 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop17 +; NO-SIMD128-FAST-NEXT: i32.add $push18=, $5, $21 +; NO-SIMD128-FAST-NEXT: i32.const $push101=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop101 +; NO-SIMD128-FAST-NEXT: i32.const $push100=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $pop100 ; NO-SIMD128-FAST-NEXT: i32.const $push99=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push67=, $pop66, $pop99 -; NO-SIMD128-FAST-NEXT: i32.const $push98=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push68=, $pop67, $pop98 -; NO-SIMD128-FAST-NEXT: i32.const $push97=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push69=, $pop68, $pop97 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop65), $pop69 -; NO-SIMD128-FAST-NEXT: i32.const $push70=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push71=, $0, $pop70 -; NO-SIMD128-FAST-NEXT: i32.add $push72=, $14, $30 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop99 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop21 +; NO-SIMD128-FAST-NEXT: i32.add $push22=, $6, $22 +; NO-SIMD128-FAST-NEXT: i32.const $push98=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push23=, $pop22, $pop98 +; NO-SIMD128-FAST-NEXT: i32.const $push97=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push24=, $pop23, $pop97 ; NO-SIMD128-FAST-NEXT: i32.const $push96=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push73=, $pop72, $pop96 -; NO-SIMD128-FAST-NEXT: i32.const $push95=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push74=, $pop73, $pop95 -; NO-SIMD128-FAST-NEXT: i32.const $push94=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push75=, $pop74, $pop94 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop71), $pop75 -; NO-SIMD128-FAST-NEXT: i32.const $push76=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push77=, $0, $pop76 -; NO-SIMD128-FAST-NEXT: i32.add $push78=, $15, $31 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, 
$pop24, $pop96 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop25 +; NO-SIMD128-FAST-NEXT: i32.add $push26=, $7, $23 +; NO-SIMD128-FAST-NEXT: i32.const $push95=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop95 +; NO-SIMD128-FAST-NEXT: i32.const $push94=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop94 ; NO-SIMD128-FAST-NEXT: i32.const $push93=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push79=, $pop78, $pop93 -; NO-SIMD128-FAST-NEXT: i32.const $push92=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push80=, $pop79, $pop92 -; NO-SIMD128-FAST-NEXT: i32.const $push91=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push81=, $pop80, $pop91 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop77), $pop81 -; NO-SIMD128-FAST-NEXT: i32.const $push82=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push83=, $0, $pop82 -; NO-SIMD128-FAST-NEXT: i32.add $push84=, $16, $32 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop93 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop29 +; NO-SIMD128-FAST-NEXT: i32.add $push30=, $8, $24 +; NO-SIMD128-FAST-NEXT: i32.const $push92=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push31=, $pop30, $pop92 +; NO-SIMD128-FAST-NEXT: i32.const $push91=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push32=, $pop31, $pop91 ; NO-SIMD128-FAST-NEXT: i32.const $push90=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push85=, $pop84, $pop90 -; NO-SIMD128-FAST-NEXT: i32.const $push89=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push86=, $pop85, $pop89 -; NO-SIMD128-FAST-NEXT: i32.const $push88=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push87=, $pop86, $pop88 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop83), $pop87 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop90 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop33 +; NO-SIMD128-FAST-NEXT: i32.add $push34=, $9, $25 +; NO-SIMD128-FAST-NEXT: i32.const $push89=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push35=, $pop34, $pop89 +; NO-SIMD128-FAST-NEXT: i32.const $push88=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push36=, $pop35, $pop88 +; NO-SIMD128-FAST-NEXT: 
i32.const $push87=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push37=, $pop36, $pop87 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop37 +; NO-SIMD128-FAST-NEXT: i32.add $push38=, $10, $26 +; NO-SIMD128-FAST-NEXT: i32.const $push86=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop86 +; NO-SIMD128-FAST-NEXT: i32.const $push85=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop85 +; NO-SIMD128-FAST-NEXT: i32.const $push84=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop84 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop41 +; NO-SIMD128-FAST-NEXT: i32.add $push42=, $11, $27 +; NO-SIMD128-FAST-NEXT: i32.const $push83=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push43=, $pop42, $pop83 +; NO-SIMD128-FAST-NEXT: i32.const $push82=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push44=, $pop43, $pop82 +; NO-SIMD128-FAST-NEXT: i32.const $push81=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop81 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop45 +; NO-SIMD128-FAST-NEXT: i32.add $push46=, $12, $28 +; NO-SIMD128-FAST-NEXT: i32.const $push80=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push47=, $pop46, $pop80 +; NO-SIMD128-FAST-NEXT: i32.const $push79=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push48=, $pop47, $pop79 +; NO-SIMD128-FAST-NEXT: i32.const $push78=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push49=, $pop48, $pop78 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop49 +; NO-SIMD128-FAST-NEXT: i32.add $push50=, $13, $29 +; NO-SIMD128-FAST-NEXT: i32.const $push77=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push51=, $pop50, $pop77 +; NO-SIMD128-FAST-NEXT: i32.const $push76=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push52=, $pop51, $pop76 +; NO-SIMD128-FAST-NEXT: i32.const $push75=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push53=, $pop52, $pop75 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop53 +; NO-SIMD128-FAST-NEXT: i32.add $push54=, $14, $30 +; NO-SIMD128-FAST-NEXT: i32.const $push74=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop74 +; 
NO-SIMD128-FAST-NEXT: i32.const $push73=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push56=, $pop55, $pop73 +; NO-SIMD128-FAST-NEXT: i32.const $push72=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push57=, $pop56, $pop72 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop57 +; NO-SIMD128-FAST-NEXT: i32.add $push58=, $15, $31 +; NO-SIMD128-FAST-NEXT: i32.const $push71=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push59=, $pop58, $pop71 +; NO-SIMD128-FAST-NEXT: i32.const $push70=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push60=, $pop59, $pop70 +; NO-SIMD128-FAST-NEXT: i32.const $push69=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push61=, $pop60, $pop69 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop61 +; NO-SIMD128-FAST-NEXT: i32.add $push62=, $16, $32 +; NO-SIMD128-FAST-NEXT: i32.const $push68=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push63=, $pop62, $pop68 +; NO-SIMD128-FAST-NEXT: i32.const $push67=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push64=, $pop63, $pop67 +; NO-SIMD128-FAST-NEXT: i32.const $push66=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push65=, $pop64, $pop66 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop65 ; NO-SIMD128-FAST-NEXT: return %a = add nuw <16 x i8> %x, %y %b = add nuw <16 x i8> %a, @avgr_u_v16i8_wrap(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-LABEL: avgr_u_v16i8_wrap: ; NO-SIMD128: .functype avgr_u_v16i8_wrap (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push0=, 15 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.add $push2=, $16, $32 -; NO-SIMD128-NEXT: i32.const $push3=, 1 -; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3 -; NO-SIMD128-NEXT: i32.const $push5=, 254 -; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5 -; NO-SIMD128-NEXT: i32.const $push133=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop133 -; NO-SIMD128-NEXT: i32.store8 0($pop1), $pop7 -; 
NO-SIMD128-NEXT: i32.const $push8=, 14 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.add $push10=, $15, $31 -; NO-SIMD128-NEXT: i32.const $push132=, 1 -; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop132 -; NO-SIMD128-NEXT: i32.const $push131=, 254 -; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop131 -; NO-SIMD128-NEXT: i32.const $push130=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop130 -; NO-SIMD128-NEXT: i32.store8 0($pop9), $pop13 -; NO-SIMD128-NEXT: i32.const $push14=, 13 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.add $push16=, $14, $30 -; NO-SIMD128-NEXT: i32.const $push129=, 1 -; NO-SIMD128-NEXT: i32.add $push17=, $pop16, $pop129 -; NO-SIMD128-NEXT: i32.const $push128=, 254 -; NO-SIMD128-NEXT: i32.and $push18=, $pop17, $pop128 -; NO-SIMD128-NEXT: i32.const $push127=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push19=, $pop18, $pop127 -; NO-SIMD128-NEXT: i32.store8 0($pop15), $pop19 -; NO-SIMD128-NEXT: i32.const $push20=, 12 -; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20 -; NO-SIMD128-NEXT: i32.add $push22=, $13, $29 -; NO-SIMD128-NEXT: i32.const $push126=, 1 -; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop126 -; NO-SIMD128-NEXT: i32.const $push125=, 254 -; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop125 -; NO-SIMD128-NEXT: i32.const $push124=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop124 -; NO-SIMD128-NEXT: i32.store8 0($pop21), $pop25 -; NO-SIMD128-NEXT: i32.const $push26=, 11 -; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-NEXT: i32.add $push28=, $12, $28 -; NO-SIMD128-NEXT: i32.const $push123=, 1 -; NO-SIMD128-NEXT: i32.add $push29=, $pop28, $pop123 -; NO-SIMD128-NEXT: i32.const $push122=, 254 -; NO-SIMD128-NEXT: i32.and $push30=, $pop29, $pop122 -; NO-SIMD128-NEXT: i32.const $push121=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push31=, $pop30, $pop121 -; NO-SIMD128-NEXT: i32.store8 0($pop27), $pop31 -; NO-SIMD128-NEXT: i32.const $push32=, 10 -; NO-SIMD128-NEXT: i32.add 
$push33=, $0, $pop32 -; NO-SIMD128-NEXT: i32.add $push34=, $11, $27 -; NO-SIMD128-NEXT: i32.const $push120=, 1 -; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop120 -; NO-SIMD128-NEXT: i32.const $push119=, 254 -; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop119 -; NO-SIMD128-NEXT: i32.const $push118=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop118 -; NO-SIMD128-NEXT: i32.store8 0($pop33), $pop37 -; NO-SIMD128-NEXT: i32.const $push38=, 9 -; NO-SIMD128-NEXT: i32.add $push39=, $0, $pop38 -; NO-SIMD128-NEXT: i32.add $push40=, $10, $26 -; NO-SIMD128-NEXT: i32.const $push117=, 1 -; NO-SIMD128-NEXT: i32.add $push41=, $pop40, $pop117 -; NO-SIMD128-NEXT: i32.const $push116=, 254 -; NO-SIMD128-NEXT: i32.and $push42=, $pop41, $pop116 -; NO-SIMD128-NEXT: i32.const $push115=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push43=, $pop42, $pop115 -; NO-SIMD128-NEXT: i32.store8 0($pop39), $pop43 -; NO-SIMD128-NEXT: i32.add $push44=, $9, $25 -; NO-SIMD128-NEXT: i32.const $push114=, 1 -; NO-SIMD128-NEXT: i32.add $push45=, $pop44, $pop114 -; NO-SIMD128-NEXT: i32.const $push113=, 254 -; NO-SIMD128-NEXT: i32.and $push46=, $pop45, $pop113 -; NO-SIMD128-NEXT: i32.const $push112=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push47=, $pop46, $pop112 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop47 -; NO-SIMD128-NEXT: i32.const $push48=, 7 -; NO-SIMD128-NEXT: i32.add $push49=, $0, $pop48 -; NO-SIMD128-NEXT: i32.add $push50=, $8, $24 +; NO-SIMD128-NEXT: i32.add $push0=, $16, $32 +; NO-SIMD128-NEXT: i32.const $push1=, 1 +; NO-SIMD128-NEXT: i32.add $push2=, $pop0, $pop1 +; NO-SIMD128-NEXT: i32.const $push3=, 254 +; NO-SIMD128-NEXT: i32.and $push4=, $pop2, $pop3 ; NO-SIMD128-NEXT: i32.const $push111=, 1 -; NO-SIMD128-NEXT: i32.add $push51=, $pop50, $pop111 -; NO-SIMD128-NEXT: i32.const $push110=, 254 -; NO-SIMD128-NEXT: i32.and $push52=, $pop51, $pop110 -; NO-SIMD128-NEXT: i32.const $push109=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop109 -; NO-SIMD128-NEXT: i32.store8 0($pop49), $pop53 -; 
NO-SIMD128-NEXT: i32.const $push54=, 6 -; NO-SIMD128-NEXT: i32.add $push55=, $0, $pop54 -; NO-SIMD128-NEXT: i32.add $push56=, $7, $23 +; NO-SIMD128-NEXT: i32.shr_u $push5=, $pop4, $pop111 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop5 +; NO-SIMD128-NEXT: i32.add $push6=, $15, $31 +; NO-SIMD128-NEXT: i32.const $push110=, 1 +; NO-SIMD128-NEXT: i32.add $push7=, $pop6, $pop110 +; NO-SIMD128-NEXT: i32.const $push109=, 254 +; NO-SIMD128-NEXT: i32.and $push8=, $pop7, $pop109 ; NO-SIMD128-NEXT: i32.const $push108=, 1 -; NO-SIMD128-NEXT: i32.add $push57=, $pop56, $pop108 -; NO-SIMD128-NEXT: i32.const $push107=, 254 -; NO-SIMD128-NEXT: i32.and $push58=, $pop57, $pop107 -; NO-SIMD128-NEXT: i32.const $push106=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push59=, $pop58, $pop106 -; NO-SIMD128-NEXT: i32.store8 0($pop55), $pop59 -; NO-SIMD128-NEXT: i32.const $push60=, 5 -; NO-SIMD128-NEXT: i32.add $push61=, $0, $pop60 -; NO-SIMD128-NEXT: i32.add $push62=, $6, $22 +; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop108 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop9 +; NO-SIMD128-NEXT: i32.add $push10=, $14, $30 +; NO-SIMD128-NEXT: i32.const $push107=, 1 +; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop107 +; NO-SIMD128-NEXT: i32.const $push106=, 254 +; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop106 ; NO-SIMD128-NEXT: i32.const $push105=, 1 -; NO-SIMD128-NEXT: i32.add $push63=, $pop62, $pop105 -; NO-SIMD128-NEXT: i32.const $push104=, 254 -; NO-SIMD128-NEXT: i32.and $push64=, $pop63, $pop104 -; NO-SIMD128-NEXT: i32.const $push103=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push65=, $pop64, $pop103 -; NO-SIMD128-NEXT: i32.store8 0($pop61), $pop65 -; NO-SIMD128-NEXT: i32.add $push66=, $5, $21 +; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop105 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop13 +; NO-SIMD128-NEXT: i32.add $push14=, $13, $29 +; NO-SIMD128-NEXT: i32.const $push104=, 1 +; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop104 +; NO-SIMD128-NEXT: i32.const $push103=, 254 +; NO-SIMD128-NEXT: 
i32.and $push16=, $pop15, $pop103 ; NO-SIMD128-NEXT: i32.const $push102=, 1 -; NO-SIMD128-NEXT: i32.add $push67=, $pop66, $pop102 -; NO-SIMD128-NEXT: i32.const $push101=, 254 -; NO-SIMD128-NEXT: i32.and $push68=, $pop67, $pop101 -; NO-SIMD128-NEXT: i32.const $push100=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push69=, $pop68, $pop100 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop69 -; NO-SIMD128-NEXT: i32.const $push70=, 3 -; NO-SIMD128-NEXT: i32.add $push71=, $0, $pop70 -; NO-SIMD128-NEXT: i32.add $push72=, $4, $20 +; NO-SIMD128-NEXT: i32.shr_u $push17=, $pop16, $pop102 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop17 +; NO-SIMD128-NEXT: i32.add $push18=, $12, $28 +; NO-SIMD128-NEXT: i32.const $push101=, 1 +; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop101 +; NO-SIMD128-NEXT: i32.const $push100=, 254 +; NO-SIMD128-NEXT: i32.and $push20=, $pop19, $pop100 ; NO-SIMD128-NEXT: i32.const $push99=, 1 -; NO-SIMD128-NEXT: i32.add $push73=, $pop72, $pop99 -; NO-SIMD128-NEXT: i32.const $push98=, 254 -; NO-SIMD128-NEXT: i32.and $push74=, $pop73, $pop98 -; NO-SIMD128-NEXT: i32.const $push97=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push75=, $pop74, $pop97 -; NO-SIMD128-NEXT: i32.store8 0($pop71), $pop75 -; NO-SIMD128-NEXT: i32.add $push76=, $3, $19 +; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop99 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop21 +; NO-SIMD128-NEXT: i32.add $push22=, $11, $27 +; NO-SIMD128-NEXT: i32.const $push98=, 1 +; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop98 +; NO-SIMD128-NEXT: i32.const $push97=, 254 +; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop97 ; NO-SIMD128-NEXT: i32.const $push96=, 1 -; NO-SIMD128-NEXT: i32.add $push77=, $pop76, $pop96 -; NO-SIMD128-NEXT: i32.const $push95=, 254 -; NO-SIMD128-NEXT: i32.and $push78=, $pop77, $pop95 -; NO-SIMD128-NEXT: i32.const $push94=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push79=, $pop78, $pop94 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop79 -; NO-SIMD128-NEXT: i32.add $push80=, $2, $18 +; NO-SIMD128-NEXT: i32.shr_u $push25=, 
$pop24, $pop96 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop25 +; NO-SIMD128-NEXT: i32.add $push26=, $10, $26 +; NO-SIMD128-NEXT: i32.const $push95=, 1 +; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop95 +; NO-SIMD128-NEXT: i32.const $push94=, 254 +; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop94 ; NO-SIMD128-NEXT: i32.const $push93=, 1 -; NO-SIMD128-NEXT: i32.add $push81=, $pop80, $pop93 -; NO-SIMD128-NEXT: i32.const $push92=, 254 -; NO-SIMD128-NEXT: i32.and $push82=, $pop81, $pop92 -; NO-SIMD128-NEXT: i32.const $push91=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push83=, $pop82, $pop91 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop83 -; NO-SIMD128-NEXT: i32.add $push84=, $1, $17 +; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop93 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop29 +; NO-SIMD128-NEXT: i32.add $push30=, $9, $25 +; NO-SIMD128-NEXT: i32.const $push92=, 1 +; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop92 +; NO-SIMD128-NEXT: i32.const $push91=, 254 +; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop91 ; NO-SIMD128-NEXT: i32.const $push90=, 1 -; NO-SIMD128-NEXT: i32.add $push85=, $pop84, $pop90 -; NO-SIMD128-NEXT: i32.const $push89=, 254 -; NO-SIMD128-NEXT: i32.and $push86=, $pop85, $pop89 -; NO-SIMD128-NEXT: i32.const $push88=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push87=, $pop86, $pop88 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop87 +; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop90 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop33 +; NO-SIMD128-NEXT: i32.add $push34=, $8, $24 +; NO-SIMD128-NEXT: i32.const $push89=, 1 +; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop89 +; NO-SIMD128-NEXT: i32.const $push88=, 254 +; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop88 +; NO-SIMD128-NEXT: i32.const $push87=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop87 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop37 +; NO-SIMD128-NEXT: i32.add $push38=, $7, $23 +; NO-SIMD128-NEXT: i32.const $push86=, 1 +; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop86 +; NO-SIMD128-NEXT: 
i32.const $push85=, 254 +; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $pop85 +; NO-SIMD128-NEXT: i32.const $push84=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop84 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop41 +; NO-SIMD128-NEXT: i32.add $push42=, $6, $22 +; NO-SIMD128-NEXT: i32.const $push83=, 1 +; NO-SIMD128-NEXT: i32.add $push43=, $pop42, $pop83 +; NO-SIMD128-NEXT: i32.const $push82=, 254 +; NO-SIMD128-NEXT: i32.and $push44=, $pop43, $pop82 +; NO-SIMD128-NEXT: i32.const $push81=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push45=, $pop44, $pop81 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop45 +; NO-SIMD128-NEXT: i32.add $push46=, $5, $21 +; NO-SIMD128-NEXT: i32.const $push80=, 1 +; NO-SIMD128-NEXT: i32.add $push47=, $pop46, $pop80 +; NO-SIMD128-NEXT: i32.const $push79=, 254 +; NO-SIMD128-NEXT: i32.and $push48=, $pop47, $pop79 +; NO-SIMD128-NEXT: i32.const $push78=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push49=, $pop48, $pop78 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop49 +; NO-SIMD128-NEXT: i32.add $push50=, $4, $20 +; NO-SIMD128-NEXT: i32.const $push77=, 1 +; NO-SIMD128-NEXT: i32.add $push51=, $pop50, $pop77 +; NO-SIMD128-NEXT: i32.const $push76=, 254 +; NO-SIMD128-NEXT: i32.and $push52=, $pop51, $pop76 +; NO-SIMD128-NEXT: i32.const $push75=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop75 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop53 +; NO-SIMD128-NEXT: i32.add $push54=, $3, $19 +; NO-SIMD128-NEXT: i32.const $push74=, 1 +; NO-SIMD128-NEXT: i32.add $push55=, $pop54, $pop74 +; NO-SIMD128-NEXT: i32.const $push73=, 254 +; NO-SIMD128-NEXT: i32.and $push56=, $pop55, $pop73 +; NO-SIMD128-NEXT: i32.const $push72=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push57=, $pop56, $pop72 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop57 +; NO-SIMD128-NEXT: i32.add $push58=, $2, $18 +; NO-SIMD128-NEXT: i32.const $push71=, 1 +; NO-SIMD128-NEXT: i32.add $push59=, $pop58, $pop71 +; NO-SIMD128-NEXT: i32.const $push70=, 254 +; NO-SIMD128-NEXT: i32.and $push60=, $pop59, $pop70 +; NO-SIMD128-NEXT: 
i32.const $push69=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push61=, $pop60, $pop69 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop61 +; NO-SIMD128-NEXT: i32.add $push62=, $1, $17 +; NO-SIMD128-NEXT: i32.const $push68=, 1 +; NO-SIMD128-NEXT: i32.add $push63=, $pop62, $pop68 +; NO-SIMD128-NEXT: i32.const $push67=, 254 +; NO-SIMD128-NEXT: i32.and $push64=, $pop63, $pop67 +; NO-SIMD128-NEXT: i32.const $push66=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push65=, $pop64, $pop66 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop65 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: avgr_u_v16i8_wrap: @@ -2109,151 +1735,129 @@ define <16 x i8> @avgr_u_v16i8_wrap(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-FAST-NEXT: i32.add $push2=, $pop0, $pop1 ; NO-SIMD128-FAST-NEXT: i32.const $push3=, 254 ; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop2, $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push133=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop133 +; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop111 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop5 ; NO-SIMD128-FAST-NEXT: i32.add $push6=, $2, $18 -; NO-SIMD128-FAST-NEXT: i32.const $push132=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop132 -; NO-SIMD128-FAST-NEXT: i32.const $push131=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop131 -; NO-SIMD128-FAST-NEXT: i32.const $push130=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop130 +; NO-SIMD128-FAST-NEXT: i32.const $push110=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop110 +; NO-SIMD128-FAST-NEXT: i32.const $push109=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop109 +; NO-SIMD128-FAST-NEXT: i32.const $push108=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop108 ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop9 ; NO-SIMD128-FAST-NEXT: i32.add $push10=, $3, $19 -; NO-SIMD128-FAST-NEXT: i32.const $push129=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop129 -; NO-SIMD128-FAST-NEXT: 
i32.const $push128=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop128 -; NO-SIMD128-FAST-NEXT: i32.const $push127=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop127 -; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push14=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-FAST-NEXT: i32.add $push16=, $4, $20 -; NO-SIMD128-FAST-NEXT: i32.const $push126=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push17=, $pop16, $pop126 -; NO-SIMD128-FAST-NEXT: i32.const $push125=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push18=, $pop17, $pop125 -; NO-SIMD128-FAST-NEXT: i32.const $push124=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push19=, $pop18, $pop124 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop19 -; NO-SIMD128-FAST-NEXT: i32.add $push20=, $5, $21 -; NO-SIMD128-FAST-NEXT: i32.const $push123=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push21=, $pop20, $pop123 -; NO-SIMD128-FAST-NEXT: i32.const $push122=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $pop122 -; NO-SIMD128-FAST-NEXT: i32.const $push121=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push23=, $pop22, $pop121 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop23 -; NO-SIMD128-FAST-NEXT: i32.const $push24=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-FAST-NEXT: i32.add $push26=, $6, $22 -; NO-SIMD128-FAST-NEXT: i32.const $push120=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop120 -; NO-SIMD128-FAST-NEXT: i32.const $push119=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop119 -; NO-SIMD128-FAST-NEXT: i32.const $push118=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop118 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop29 -; NO-SIMD128-FAST-NEXT: i32.const $push30=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30 -; NO-SIMD128-FAST-NEXT: i32.add $push32=, $7, $23 -; NO-SIMD128-FAST-NEXT: i32.const $push117=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push33=, $pop32, $pop117 -; 
NO-SIMD128-FAST-NEXT: i32.const $push116=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $pop116 -; NO-SIMD128-FAST-NEXT: i32.const $push115=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop115 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop35 -; NO-SIMD128-FAST-NEXT: i32.const $push36=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36 -; NO-SIMD128-FAST-NEXT: i32.add $push38=, $8, $24 -; NO-SIMD128-FAST-NEXT: i32.const $push114=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop114 -; NO-SIMD128-FAST-NEXT: i32.const $push113=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop113 -; NO-SIMD128-FAST-NEXT: i32.const $push112=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop112 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop41 -; NO-SIMD128-FAST-NEXT: i32.add $push42=, $9, $25 -; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push43=, $pop42, $pop111 -; NO-SIMD128-FAST-NEXT: i32.const $push110=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push44=, $pop43, $pop110 -; NO-SIMD128-FAST-NEXT: i32.const $push109=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop109 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop45 -; NO-SIMD128-FAST-NEXT: i32.const $push46=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push47=, $0, $pop46 -; NO-SIMD128-FAST-NEXT: i32.add $push48=, $10, $26 -; NO-SIMD128-FAST-NEXT: i32.const $push108=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push49=, $pop48, $pop108 -; NO-SIMD128-FAST-NEXT: i32.const $push107=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push50=, $pop49, $pop107 -; NO-SIMD128-FAST-NEXT: i32.const $push106=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push51=, $pop50, $pop106 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop47), $pop51 -; NO-SIMD128-FAST-NEXT: i32.const $push52=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push53=, $0, $pop52 -; NO-SIMD128-FAST-NEXT: i32.add $push54=, $11, $27 +; NO-SIMD128-FAST-NEXT: i32.const $push107=, 1 +; NO-SIMD128-FAST-NEXT: i32.add 
$push11=, $pop10, $pop107 +; NO-SIMD128-FAST-NEXT: i32.const $push106=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop106 ; NO-SIMD128-FAST-NEXT: i32.const $push105=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop105 -; NO-SIMD128-FAST-NEXT: i32.const $push104=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push56=, $pop55, $pop104 -; NO-SIMD128-FAST-NEXT: i32.const $push103=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push57=, $pop56, $pop103 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop53), $pop57 -; NO-SIMD128-FAST-NEXT: i32.const $push58=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push59=, $0, $pop58 -; NO-SIMD128-FAST-NEXT: i32.add $push60=, $12, $28 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop105 +; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop13 +; NO-SIMD128-FAST-NEXT: i32.add $push14=, $4, $20 +; NO-SIMD128-FAST-NEXT: i32.const $push104=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop104 +; NO-SIMD128-FAST-NEXT: i32.const $push103=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $pop103 ; NO-SIMD128-FAST-NEXT: i32.const $push102=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push61=, $pop60, $pop102 -; NO-SIMD128-FAST-NEXT: i32.const $push101=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push62=, $pop61, $pop101 -; NO-SIMD128-FAST-NEXT: i32.const $push100=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push63=, $pop62, $pop100 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop59), $pop63 -; NO-SIMD128-FAST-NEXT: i32.const $push64=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push65=, $0, $pop64 -; NO-SIMD128-FAST-NEXT: i32.add $push66=, $13, $29 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop102 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop17 +; NO-SIMD128-FAST-NEXT: i32.add $push18=, $5, $21 +; NO-SIMD128-FAST-NEXT: i32.const $push101=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop101 +; NO-SIMD128-FAST-NEXT: i32.const $push100=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $pop100 ; NO-SIMD128-FAST-NEXT: i32.const $push99=, 
1 -; NO-SIMD128-FAST-NEXT: i32.add $push67=, $pop66, $pop99 -; NO-SIMD128-FAST-NEXT: i32.const $push98=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push68=, $pop67, $pop98 -; NO-SIMD128-FAST-NEXT: i32.const $push97=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push69=, $pop68, $pop97 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop65), $pop69 -; NO-SIMD128-FAST-NEXT: i32.const $push70=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push71=, $0, $pop70 -; NO-SIMD128-FAST-NEXT: i32.add $push72=, $14, $30 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop99 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop21 +; NO-SIMD128-FAST-NEXT: i32.add $push22=, $6, $22 +; NO-SIMD128-FAST-NEXT: i32.const $push98=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push23=, $pop22, $pop98 +; NO-SIMD128-FAST-NEXT: i32.const $push97=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push24=, $pop23, $pop97 ; NO-SIMD128-FAST-NEXT: i32.const $push96=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push73=, $pop72, $pop96 -; NO-SIMD128-FAST-NEXT: i32.const $push95=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push74=, $pop73, $pop95 -; NO-SIMD128-FAST-NEXT: i32.const $push94=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push75=, $pop74, $pop94 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop71), $pop75 -; NO-SIMD128-FAST-NEXT: i32.const $push76=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push77=, $0, $pop76 -; NO-SIMD128-FAST-NEXT: i32.add $push78=, $15, $31 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop96 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop25 +; NO-SIMD128-FAST-NEXT: i32.add $push26=, $7, $23 +; NO-SIMD128-FAST-NEXT: i32.const $push95=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop95 +; NO-SIMD128-FAST-NEXT: i32.const $push94=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop94 ; NO-SIMD128-FAST-NEXT: i32.const $push93=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push79=, $pop78, $pop93 -; NO-SIMD128-FAST-NEXT: i32.const $push92=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push80=, $pop79, $pop92 -; NO-SIMD128-FAST-NEXT: i32.const 
$push91=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push81=, $pop80, $pop91 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop77), $pop81 -; NO-SIMD128-FAST-NEXT: i32.const $push82=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push83=, $0, $pop82 -; NO-SIMD128-FAST-NEXT: i32.add $push84=, $16, $32 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop93 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop29 +; NO-SIMD128-FAST-NEXT: i32.add $push30=, $8, $24 +; NO-SIMD128-FAST-NEXT: i32.const $push92=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push31=, $pop30, $pop92 +; NO-SIMD128-FAST-NEXT: i32.const $push91=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push32=, $pop31, $pop91 ; NO-SIMD128-FAST-NEXT: i32.const $push90=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push85=, $pop84, $pop90 -; NO-SIMD128-FAST-NEXT: i32.const $push89=, 254 -; NO-SIMD128-FAST-NEXT: i32.and $push86=, $pop85, $pop89 -; NO-SIMD128-FAST-NEXT: i32.const $push88=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push87=, $pop86, $pop88 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop83), $pop87 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop90 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop33 +; NO-SIMD128-FAST-NEXT: i32.add $push34=, $9, $25 +; NO-SIMD128-FAST-NEXT: i32.const $push89=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push35=, $pop34, $pop89 +; NO-SIMD128-FAST-NEXT: i32.const $push88=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push36=, $pop35, $pop88 +; NO-SIMD128-FAST-NEXT: i32.const $push87=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push37=, $pop36, $pop87 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop37 +; NO-SIMD128-FAST-NEXT: i32.add $push38=, $10, $26 +; NO-SIMD128-FAST-NEXT: i32.const $push86=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop86 +; NO-SIMD128-FAST-NEXT: i32.const $push85=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop85 +; NO-SIMD128-FAST-NEXT: i32.const $push84=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop84 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop41 +; 
NO-SIMD128-FAST-NEXT: i32.add $push42=, $11, $27 +; NO-SIMD128-FAST-NEXT: i32.const $push83=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push43=, $pop42, $pop83 +; NO-SIMD128-FAST-NEXT: i32.const $push82=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push44=, $pop43, $pop82 +; NO-SIMD128-FAST-NEXT: i32.const $push81=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop81 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop45 +; NO-SIMD128-FAST-NEXT: i32.add $push46=, $12, $28 +; NO-SIMD128-FAST-NEXT: i32.const $push80=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push47=, $pop46, $pop80 +; NO-SIMD128-FAST-NEXT: i32.const $push79=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push48=, $pop47, $pop79 +; NO-SIMD128-FAST-NEXT: i32.const $push78=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push49=, $pop48, $pop78 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop49 +; NO-SIMD128-FAST-NEXT: i32.add $push50=, $13, $29 +; NO-SIMD128-FAST-NEXT: i32.const $push77=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push51=, $pop50, $pop77 +; NO-SIMD128-FAST-NEXT: i32.const $push76=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push52=, $pop51, $pop76 +; NO-SIMD128-FAST-NEXT: i32.const $push75=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push53=, $pop52, $pop75 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop53 +; NO-SIMD128-FAST-NEXT: i32.add $push54=, $14, $30 +; NO-SIMD128-FAST-NEXT: i32.const $push74=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop74 +; NO-SIMD128-FAST-NEXT: i32.const $push73=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push56=, $pop55, $pop73 +; NO-SIMD128-FAST-NEXT: i32.const $push72=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push57=, $pop56, $pop72 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop57 +; NO-SIMD128-FAST-NEXT: i32.add $push58=, $15, $31 +; NO-SIMD128-FAST-NEXT: i32.const $push71=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push59=, $pop58, $pop71 +; NO-SIMD128-FAST-NEXT: i32.const $push70=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push60=, $pop59, $pop70 +; NO-SIMD128-FAST-NEXT: i32.const $push69=, 1 +; 
NO-SIMD128-FAST-NEXT: i32.shr_u $push61=, $pop60, $pop69 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop61 +; NO-SIMD128-FAST-NEXT: i32.add $push62=, $16, $32 +; NO-SIMD128-FAST-NEXT: i32.const $push68=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push63=, $pop62, $pop68 +; NO-SIMD128-FAST-NEXT: i32.const $push67=, 254 +; NO-SIMD128-FAST-NEXT: i32.and $push64=, $pop63, $pop67 +; NO-SIMD128-FAST-NEXT: i32.const $push66=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push65=, $pop64, $pop66 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop65 ; NO-SIMD128-FAST-NEXT: return %a = add <16 x i8> %x, %y %b = add <16 x i8> %a, @abs_v16i8(<16 x i8> %x) { ; NO-SIMD128-LABEL: abs_v16i8: ; NO-SIMD128: .functype abs_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push4=, 15 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 ; NO-SIMD128-NEXT: i32.extend8_s $push0=, $16 ; NO-SIMD128-NEXT: i32.const $push1=, 7 -; NO-SIMD128-NEXT: i32.shr_s $push117=, $pop0, $pop1 -; NO-SIMD128-NEXT: local.tee $push116=, $17=, $pop117 -; NO-SIMD128-NEXT: i32.xor $push2=, $16, $pop116 +; NO-SIMD128-NEXT: i32.shr_s $push95=, $pop0, $pop1 +; NO-SIMD128-NEXT: local.tee $push94=, $17=, $pop95 +; NO-SIMD128-NEXT: i32.xor $push2=, $16, $pop94 ; NO-SIMD128-NEXT: i32.sub $push3=, $pop2, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop5), $pop3 -; NO-SIMD128-NEXT: i32.const $push9=, 14 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-NEXT: i32.extend8_s $push6=, $15 -; NO-SIMD128-NEXT: i32.const $push115=, 7 -; NO-SIMD128-NEXT: i32.shr_s $push114=, $pop6, $pop115 -; NO-SIMD128-NEXT: local.tee $push113=, $16=, $pop114 -; NO-SIMD128-NEXT: i32.xor $push7=, $15, $pop113 -; NO-SIMD128-NEXT: i32.sub $push8=, $pop7, $16 -; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8 -; NO-SIMD128-NEXT: i32.const $push14=, 13 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.extend8_s $push11=, $14 -; 
NO-SIMD128-NEXT: i32.const $push112=, 7 -; NO-SIMD128-NEXT: i32.shr_s $push111=, $pop11, $pop112 -; NO-SIMD128-NEXT: local.tee $push110=, $16=, $pop111 -; NO-SIMD128-NEXT: i32.xor $push12=, $14, $pop110 -; NO-SIMD128-NEXT: i32.sub $push13=, $pop12, $16 -; NO-SIMD128-NEXT: i32.store8 0($pop15), $pop13 -; NO-SIMD128-NEXT: i32.const $push19=, 12 -; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19 -; NO-SIMD128-NEXT: i32.extend8_s $push16=, $13 -; NO-SIMD128-NEXT: i32.const $push109=, 7 -; NO-SIMD128-NEXT: i32.shr_s $push108=, $pop16, $pop109 -; NO-SIMD128-NEXT: local.tee $push107=, $16=, $pop108 -; NO-SIMD128-NEXT: i32.xor $push17=, $13, $pop107 -; NO-SIMD128-NEXT: i32.sub $push18=, $pop17, $16 -; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18 -; NO-SIMD128-NEXT: i32.const $push24=, 11 -; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-NEXT: i32.extend8_s $push21=, $12 -; NO-SIMD128-NEXT: i32.const $push106=, 7 -; NO-SIMD128-NEXT: i32.shr_s $push105=, $pop21, $pop106 -; NO-SIMD128-NEXT: local.tee $push104=, $16=, $pop105 -; NO-SIMD128-NEXT: i32.xor $push22=, $12, $pop104 -; NO-SIMD128-NEXT: i32.sub $push23=, $pop22, $16 -; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23 -; NO-SIMD128-NEXT: i32.const $push29=, 10 -; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-NEXT: i32.extend8_s $push26=, $11 -; NO-SIMD128-NEXT: i32.const $push103=, 7 -; NO-SIMD128-NEXT: i32.shr_s $push102=, $pop26, $pop103 -; NO-SIMD128-NEXT: local.tee $push101=, $16=, $pop102 -; NO-SIMD128-NEXT: i32.xor $push27=, $11, $pop101 -; NO-SIMD128-NEXT: i32.sub $push28=, $pop27, $16 -; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28 -; NO-SIMD128-NEXT: i32.const $push34=, 9 -; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34 -; NO-SIMD128-NEXT: i32.extend8_s $push31=, $10 -; NO-SIMD128-NEXT: i32.const $push100=, 7 -; NO-SIMD128-NEXT: i32.shr_s $push99=, $pop31, $pop100 -; NO-SIMD128-NEXT: local.tee $push98=, $16=, $pop99 -; NO-SIMD128-NEXT: i32.xor $push32=, $10, $pop98 -; NO-SIMD128-NEXT: 
i32.sub $push33=, $pop32, $16 -; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33 -; NO-SIMD128-NEXT: i32.extend8_s $push36=, $9 -; NO-SIMD128-NEXT: i32.const $push97=, 7 -; NO-SIMD128-NEXT: i32.shr_s $push96=, $pop36, $pop97 -; NO-SIMD128-NEXT: local.tee $push95=, $16=, $pop96 -; NO-SIMD128-NEXT: i32.xor $push37=, $9, $pop95 -; NO-SIMD128-NEXT: i32.sub $push38=, $pop37, $16 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop38 -; NO-SIMD128-NEXT: i32.const $push94=, 7 -; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop94 -; NO-SIMD128-NEXT: i32.extend8_s $push39=, $8 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop3 +; NO-SIMD128-NEXT: i32.extend8_s $push4=, $15 ; NO-SIMD128-NEXT: i32.const $push93=, 7 -; NO-SIMD128-NEXT: i32.shr_s $push92=, $pop39, $pop93 +; NO-SIMD128-NEXT: i32.shr_s $push92=, $pop4, $pop93 ; NO-SIMD128-NEXT: local.tee $push91=, $16=, $pop92 -; NO-SIMD128-NEXT: i32.xor $push40=, $8, $pop91 -; NO-SIMD128-NEXT: i32.sub $push41=, $pop40, $16 -; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop41 -; NO-SIMD128-NEXT: i32.const $push46=, 6 -; NO-SIMD128-NEXT: i32.add $push47=, $0, $pop46 -; NO-SIMD128-NEXT: i32.extend8_s $push43=, $7 +; NO-SIMD128-NEXT: i32.xor $push5=, $15, $pop91 +; NO-SIMD128-NEXT: i32.sub $push6=, $pop5, $16 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop6 +; NO-SIMD128-NEXT: i32.extend8_s $push7=, $14 ; NO-SIMD128-NEXT: i32.const $push90=, 7 -; NO-SIMD128-NEXT: i32.shr_s $push89=, $pop43, $pop90 +; NO-SIMD128-NEXT: i32.shr_s $push89=, $pop7, $pop90 ; NO-SIMD128-NEXT: local.tee $push88=, $16=, $pop89 -; NO-SIMD128-NEXT: i32.xor $push44=, $7, $pop88 -; NO-SIMD128-NEXT: i32.sub $push45=, $pop44, $16 -; NO-SIMD128-NEXT: i32.store8 0($pop47), $pop45 -; NO-SIMD128-NEXT: i32.const $push51=, 5 -; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51 -; NO-SIMD128-NEXT: i32.extend8_s $push48=, $6 +; NO-SIMD128-NEXT: i32.xor $push8=, $14, $pop88 +; NO-SIMD128-NEXT: i32.sub $push9=, $pop8, $16 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop9 +; NO-SIMD128-NEXT: i32.extend8_s 
$push10=, $13 ; NO-SIMD128-NEXT: i32.const $push87=, 7 -; NO-SIMD128-NEXT: i32.shr_s $push86=, $pop48, $pop87 +; NO-SIMD128-NEXT: i32.shr_s $push86=, $pop10, $pop87 ; NO-SIMD128-NEXT: local.tee $push85=, $16=, $pop86 -; NO-SIMD128-NEXT: i32.xor $push49=, $6, $pop85 -; NO-SIMD128-NEXT: i32.sub $push50=, $pop49, $16 -; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50 -; NO-SIMD128-NEXT: i32.extend8_s $push53=, $5 +; NO-SIMD128-NEXT: i32.xor $push11=, $13, $pop85 +; NO-SIMD128-NEXT: i32.sub $push12=, $pop11, $16 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop12 +; NO-SIMD128-NEXT: i32.extend8_s $push13=, $12 ; NO-SIMD128-NEXT: i32.const $push84=, 7 -; NO-SIMD128-NEXT: i32.shr_s $push83=, $pop53, $pop84 +; NO-SIMD128-NEXT: i32.shr_s $push83=, $pop13, $pop84 ; NO-SIMD128-NEXT: local.tee $push82=, $16=, $pop83 -; NO-SIMD128-NEXT: i32.xor $push54=, $5, $pop82 -; NO-SIMD128-NEXT: i32.sub $push55=, $pop54, $16 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop55 -; NO-SIMD128-NEXT: i32.const $push59=, 3 -; NO-SIMD128-NEXT: i32.add $push60=, $0, $pop59 -; NO-SIMD128-NEXT: i32.extend8_s $push56=, $4 +; NO-SIMD128-NEXT: i32.xor $push14=, $12, $pop82 +; NO-SIMD128-NEXT: i32.sub $push15=, $pop14, $16 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop15 +; NO-SIMD128-NEXT: i32.extend8_s $push16=, $11 ; NO-SIMD128-NEXT: i32.const $push81=, 7 -; NO-SIMD128-NEXT: i32.shr_s $push80=, $pop56, $pop81 +; NO-SIMD128-NEXT: i32.shr_s $push80=, $pop16, $pop81 ; NO-SIMD128-NEXT: local.tee $push79=, $16=, $pop80 -; NO-SIMD128-NEXT: i32.xor $push57=, $4, $pop79 -; NO-SIMD128-NEXT: i32.sub $push58=, $pop57, $16 -; NO-SIMD128-NEXT: i32.store8 0($pop60), $pop58 -; NO-SIMD128-NEXT: i32.extend8_s $push61=, $3 +; NO-SIMD128-NEXT: i32.xor $push17=, $11, $pop79 +; NO-SIMD128-NEXT: i32.sub $push18=, $pop17, $16 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop18 +; NO-SIMD128-NEXT: i32.extend8_s $push19=, $10 ; NO-SIMD128-NEXT: i32.const $push78=, 7 -; NO-SIMD128-NEXT: i32.shr_s $push77=, $pop61, $pop78 +; NO-SIMD128-NEXT: 
i32.shr_s $push77=, $pop19, $pop78 ; NO-SIMD128-NEXT: local.tee $push76=, $16=, $pop77 -; NO-SIMD128-NEXT: i32.xor $push62=, $3, $pop76 -; NO-SIMD128-NEXT: i32.sub $push63=, $pop62, $16 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop63 -; NO-SIMD128-NEXT: i32.extend8_s $push64=, $2 +; NO-SIMD128-NEXT: i32.xor $push20=, $10, $pop76 +; NO-SIMD128-NEXT: i32.sub $push21=, $pop20, $16 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop21 +; NO-SIMD128-NEXT: i32.extend8_s $push22=, $9 ; NO-SIMD128-NEXT: i32.const $push75=, 7 -; NO-SIMD128-NEXT: i32.shr_s $push74=, $pop64, $pop75 +; NO-SIMD128-NEXT: i32.shr_s $push74=, $pop22, $pop75 ; NO-SIMD128-NEXT: local.tee $push73=, $16=, $pop74 -; NO-SIMD128-NEXT: i32.xor $push65=, $2, $pop73 -; NO-SIMD128-NEXT: i32.sub $push66=, $pop65, $16 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop66 -; NO-SIMD128-NEXT: i32.extend8_s $push67=, $1 +; NO-SIMD128-NEXT: i32.xor $push23=, $9, $pop73 +; NO-SIMD128-NEXT: i32.sub $push24=, $pop23, $16 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop24 +; NO-SIMD128-NEXT: i32.extend8_s $push25=, $8 ; NO-SIMD128-NEXT: i32.const $push72=, 7 -; NO-SIMD128-NEXT: i32.shr_s $push71=, $pop67, $pop72 +; NO-SIMD128-NEXT: i32.shr_s $push71=, $pop25, $pop72 ; NO-SIMD128-NEXT: local.tee $push70=, $16=, $pop71 -; NO-SIMD128-NEXT: i32.xor $push68=, $1, $pop70 -; NO-SIMD128-NEXT: i32.sub $push69=, $pop68, $16 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop69 +; NO-SIMD128-NEXT: i32.xor $push26=, $8, $pop70 +; NO-SIMD128-NEXT: i32.sub $push27=, $pop26, $16 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop27 +; NO-SIMD128-NEXT: i32.extend8_s $push28=, $7 +; NO-SIMD128-NEXT: i32.const $push69=, 7 +; NO-SIMD128-NEXT: i32.shr_s $push68=, $pop28, $pop69 +; NO-SIMD128-NEXT: local.tee $push67=, $16=, $pop68 +; NO-SIMD128-NEXT: i32.xor $push29=, $7, $pop67 +; NO-SIMD128-NEXT: i32.sub $push30=, $pop29, $16 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop30 +; NO-SIMD128-NEXT: i32.extend8_s $push31=, $6 +; NO-SIMD128-NEXT: i32.const $push66=, 7 +; NO-SIMD128-NEXT: 
i32.shr_s $push65=, $pop31, $pop66 +; NO-SIMD128-NEXT: local.tee $push64=, $16=, $pop65 +; NO-SIMD128-NEXT: i32.xor $push32=, $6, $pop64 +; NO-SIMD128-NEXT: i32.sub $push33=, $pop32, $16 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop33 +; NO-SIMD128-NEXT: i32.extend8_s $push34=, $5 +; NO-SIMD128-NEXT: i32.const $push63=, 7 +; NO-SIMD128-NEXT: i32.shr_s $push62=, $pop34, $pop63 +; NO-SIMD128-NEXT: local.tee $push61=, $16=, $pop62 +; NO-SIMD128-NEXT: i32.xor $push35=, $5, $pop61 +; NO-SIMD128-NEXT: i32.sub $push36=, $pop35, $16 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop36 +; NO-SIMD128-NEXT: i32.extend8_s $push37=, $4 +; NO-SIMD128-NEXT: i32.const $push60=, 7 +; NO-SIMD128-NEXT: i32.shr_s $push59=, $pop37, $pop60 +; NO-SIMD128-NEXT: local.tee $push58=, $16=, $pop59 +; NO-SIMD128-NEXT: i32.xor $push38=, $4, $pop58 +; NO-SIMD128-NEXT: i32.sub $push39=, $pop38, $16 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop39 +; NO-SIMD128-NEXT: i32.extend8_s $push40=, $3 +; NO-SIMD128-NEXT: i32.const $push57=, 7 +; NO-SIMD128-NEXT: i32.shr_s $push56=, $pop40, $pop57 +; NO-SIMD128-NEXT: local.tee $push55=, $16=, $pop56 +; NO-SIMD128-NEXT: i32.xor $push41=, $3, $pop55 +; NO-SIMD128-NEXT: i32.sub $push42=, $pop41, $16 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop42 +; NO-SIMD128-NEXT: i32.extend8_s $push43=, $2 +; NO-SIMD128-NEXT: i32.const $push54=, 7 +; NO-SIMD128-NEXT: i32.shr_s $push53=, $pop43, $pop54 +; NO-SIMD128-NEXT: local.tee $push52=, $16=, $pop53 +; NO-SIMD128-NEXT: i32.xor $push44=, $2, $pop52 +; NO-SIMD128-NEXT: i32.sub $push45=, $pop44, $16 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop45 +; NO-SIMD128-NEXT: i32.extend8_s $push46=, $1 +; NO-SIMD128-NEXT: i32.const $push51=, 7 +; NO-SIMD128-NEXT: i32.shr_s $push50=, $pop46, $pop51 +; NO-SIMD128-NEXT: local.tee $push49=, $16=, $pop50 +; NO-SIMD128-NEXT: i32.xor $push47=, $1, $pop49 +; NO-SIMD128-NEXT: i32.sub $push48=, $pop47, $16 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop48 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: 
abs_v16i8: @@ -2420,138 +2002,116 @@ define <16 x i8> @abs_v16i8(<16 x i8> %x) { ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.extend8_s $push0=, $1 ; NO-SIMD128-FAST-NEXT: i32.const $push1=, 7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push117=, $pop0, $pop1 -; NO-SIMD128-FAST-NEXT: local.tee $push116=, $17=, $pop117 -; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop116 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push95=, $pop0, $pop1 +; NO-SIMD128-FAST-NEXT: local.tee $push94=, $17=, $pop95 +; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop94 ; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop2, $17 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop3 ; NO-SIMD128-FAST-NEXT: i32.extend8_s $push4=, $2 -; NO-SIMD128-FAST-NEXT: i32.const $push115=, 7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push114=, $pop4, $pop115 -; NO-SIMD128-FAST-NEXT: local.tee $push113=, $1=, $pop114 -; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop113 +; NO-SIMD128-FAST-NEXT: i32.const $push93=, 7 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push92=, $pop4, $pop93 +; NO-SIMD128-FAST-NEXT: local.tee $push91=, $1=, $pop92 +; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop91 ; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop5, $1 ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6 ; NO-SIMD128-FAST-NEXT: i32.extend8_s $push7=, $3 -; NO-SIMD128-FAST-NEXT: i32.const $push112=, 7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push111=, $pop7, $pop112 -; NO-SIMD128-FAST-NEXT: local.tee $push110=, $2=, $pop111 -; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $3, $pop110 +; NO-SIMD128-FAST-NEXT: i32.const $push90=, 7 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push89=, $pop7, $pop90 +; NO-SIMD128-FAST-NEXT: local.tee $push88=, $2=, $pop89 +; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $3, $pop88 ; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $pop8, $2 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 ; NO-SIMD128-FAST-NEXT: i32.extend8_s $push10=, $4 -; 
NO-SIMD128-FAST-NEXT: i32.const $push109=, 7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push108=, $pop10, $pop109 -; NO-SIMD128-FAST-NEXT: local.tee $push107=, $3=, $pop108 -; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $4, $pop107 -; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $pop11, $3 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop12 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $5 -; NO-SIMD128-FAST-NEXT: i32.const $push106=, 7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push105=, $pop15, $pop106 -; NO-SIMD128-FAST-NEXT: local.tee $push104=, $4=, $pop105 -; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $5, $pop104 -; NO-SIMD128-FAST-NEXT: i32.sub $push17=, $pop16, $4 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $6 -; NO-SIMD128-FAST-NEXT: i32.const $push103=, 7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push102=, $pop18, $pop103 -; NO-SIMD128-FAST-NEXT: local.tee $push101=, $5=, $pop102 -; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $6, $pop101 -; NO-SIMD128-FAST-NEXT: i32.sub $push20=, $pop19, $5 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $7 -; NO-SIMD128-FAST-NEXT: i32.const $push100=, 7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push99=, $pop23, $pop100 -; NO-SIMD128-FAST-NEXT: local.tee $push98=, $6=, $pop99 -; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $7, $pop98 -; NO-SIMD128-FAST-NEXT: i32.sub $push25=, $pop24, $6 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push97=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop97 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $8 -; NO-SIMD128-FAST-NEXT: i32.const $push96=, 7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push95=, $pop28, $pop96 -; NO-SIMD128-FAST-NEXT: local.tee $push94=, $7=, $pop95 -; 
NO-SIMD128-FAST-NEXT: i32.xor $push29=, $8, $pop94 -; NO-SIMD128-FAST-NEXT: i32.sub $push30=, $pop29, $7 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop30 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push32=, $9 -; NO-SIMD128-FAST-NEXT: i32.const $push93=, 7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push92=, $pop32, $pop93 -; NO-SIMD128-FAST-NEXT: local.tee $push91=, $8=, $pop92 -; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $9, $pop91 -; NO-SIMD128-FAST-NEXT: i32.sub $push34=, $pop33, $8 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop34 -; NO-SIMD128-FAST-NEXT: i32.const $push38=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push35=, $10 -; NO-SIMD128-FAST-NEXT: i32.const $push90=, 7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push89=, $pop35, $pop90 -; NO-SIMD128-FAST-NEXT: local.tee $push88=, $9=, $pop89 -; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $10, $pop88 -; NO-SIMD128-FAST-NEXT: i32.sub $push37=, $pop36, $9 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop39), $pop37 -; NO-SIMD128-FAST-NEXT: i32.const $push43=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push44=, $0, $pop43 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $11 ; NO-SIMD128-FAST-NEXT: i32.const $push87=, 7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push86=, $pop40, $pop87 -; NO-SIMD128-FAST-NEXT: local.tee $push85=, $10=, $pop86 -; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $11, $pop85 -; NO-SIMD128-FAST-NEXT: i32.sub $push42=, $pop41, $10 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop44), $pop42 -; NO-SIMD128-FAST-NEXT: i32.const $push48=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push49=, $0, $pop48 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $12 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push86=, $pop10, $pop87 +; NO-SIMD128-FAST-NEXT: local.tee $push85=, $3=, $pop86 +; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $4, $pop85 +; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $pop11, $3 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $5 ; 
NO-SIMD128-FAST-NEXT: i32.const $push84=, 7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push83=, $pop45, $pop84 -; NO-SIMD128-FAST-NEXT: local.tee $push82=, $11=, $pop83 -; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $12, $pop82 -; NO-SIMD128-FAST-NEXT: i32.sub $push47=, $pop46, $11 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop49), $pop47 -; NO-SIMD128-FAST-NEXT: i32.const $push53=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push54=, $0, $pop53 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push50=, $13 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push83=, $pop13, $pop84 +; NO-SIMD128-FAST-NEXT: local.tee $push82=, $4=, $pop83 +; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $5, $pop82 +; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $pop14, $4 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop15 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $6 ; NO-SIMD128-FAST-NEXT: i32.const $push81=, 7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push80=, $pop50, $pop81 -; NO-SIMD128-FAST-NEXT: local.tee $push79=, $12=, $pop80 -; NO-SIMD128-FAST-NEXT: i32.xor $push51=, $13, $pop79 -; NO-SIMD128-FAST-NEXT: i32.sub $push52=, $pop51, $12 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop54), $pop52 -; NO-SIMD128-FAST-NEXT: i32.const $push58=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push59=, $0, $pop58 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push55=, $14 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push80=, $pop16, $pop81 +; NO-SIMD128-FAST-NEXT: local.tee $push79=, $5=, $pop80 +; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $6, $pop79 +; NO-SIMD128-FAST-NEXT: i32.sub $push18=, $pop17, $5 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop18 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $7 ; NO-SIMD128-FAST-NEXT: i32.const $push78=, 7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push77=, $pop55, $pop78 -; NO-SIMD128-FAST-NEXT: local.tee $push76=, $13=, $pop77 -; NO-SIMD128-FAST-NEXT: i32.xor $push56=, $14, $pop76 -; NO-SIMD128-FAST-NEXT: i32.sub $push57=, $pop56, $13 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop59), $pop57 -; NO-SIMD128-FAST-NEXT: i32.const $push63=, 14 -; 
NO-SIMD128-FAST-NEXT: i32.add $push64=, $0, $pop63 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push60=, $15 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push77=, $pop19, $pop78 +; NO-SIMD128-FAST-NEXT: local.tee $push76=, $6=, $pop77 +; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $7, $pop76 +; NO-SIMD128-FAST-NEXT: i32.sub $push21=, $pop20, $6 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop21 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $8 ; NO-SIMD128-FAST-NEXT: i32.const $push75=, 7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push74=, $pop60, $pop75 -; NO-SIMD128-FAST-NEXT: local.tee $push73=, $14=, $pop74 -; NO-SIMD128-FAST-NEXT: i32.xor $push61=, $15, $pop73 -; NO-SIMD128-FAST-NEXT: i32.sub $push62=, $pop61, $14 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop64), $pop62 -; NO-SIMD128-FAST-NEXT: i32.const $push68=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push69=, $0, $pop68 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push65=, $16 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push74=, $pop22, $pop75 +; NO-SIMD128-FAST-NEXT: local.tee $push73=, $7=, $pop74 +; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $8, $pop73 +; NO-SIMD128-FAST-NEXT: i32.sub $push24=, $pop23, $7 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $9 ; NO-SIMD128-FAST-NEXT: i32.const $push72=, 7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push71=, $pop65, $pop72 -; NO-SIMD128-FAST-NEXT: local.tee $push70=, $0=, $pop71 -; NO-SIMD128-FAST-NEXT: i32.xor $push66=, $16, $pop70 -; NO-SIMD128-FAST-NEXT: i32.sub $push67=, $pop66, $0 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop69), $pop67 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push71=, $pop25, $pop72 +; NO-SIMD128-FAST-NEXT: local.tee $push70=, $8=, $pop71 +; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $9, $pop70 +; NO-SIMD128-FAST-NEXT: i32.sub $push27=, $pop26, $8 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop27 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $10 +; NO-SIMD128-FAST-NEXT: i32.const $push69=, 7 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push68=, $pop28, 
$pop69 +; NO-SIMD128-FAST-NEXT: local.tee $push67=, $9=, $pop68 +; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $10, $pop67 +; NO-SIMD128-FAST-NEXT: i32.sub $push30=, $pop29, $9 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop30 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push31=, $11 +; NO-SIMD128-FAST-NEXT: i32.const $push66=, 7 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push65=, $pop31, $pop66 +; NO-SIMD128-FAST-NEXT: local.tee $push64=, $10=, $pop65 +; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $11, $pop64 +; NO-SIMD128-FAST-NEXT: i32.sub $push33=, $pop32, $10 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop33 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push34=, $12 +; NO-SIMD128-FAST-NEXT: i32.const $push63=, 7 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push62=, $pop34, $pop63 +; NO-SIMD128-FAST-NEXT: local.tee $push61=, $11=, $pop62 +; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $12, $pop61 +; NO-SIMD128-FAST-NEXT: i32.sub $push36=, $pop35, $11 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop36 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push37=, $13 +; NO-SIMD128-FAST-NEXT: i32.const $push60=, 7 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push59=, $pop37, $pop60 +; NO-SIMD128-FAST-NEXT: local.tee $push58=, $12=, $pop59 +; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $13, $pop58 +; NO-SIMD128-FAST-NEXT: i32.sub $push39=, $pop38, $12 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop39 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $14 +; NO-SIMD128-FAST-NEXT: i32.const $push57=, 7 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push56=, $pop40, $pop57 +; NO-SIMD128-FAST-NEXT: local.tee $push55=, $13=, $pop56 +; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $14, $pop55 +; NO-SIMD128-FAST-NEXT: i32.sub $push42=, $pop41, $13 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop42 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push43=, $15 +; NO-SIMD128-FAST-NEXT: i32.const $push54=, 7 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push53=, $pop43, $pop54 +; NO-SIMD128-FAST-NEXT: local.tee $push52=, $14=, $pop53 +; NO-SIMD128-FAST-NEXT: i32.xor $push44=, 
$15, $pop52 +; NO-SIMD128-FAST-NEXT: i32.sub $push45=, $pop44, $14 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop45 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push46=, $16 +; NO-SIMD128-FAST-NEXT: i32.const $push51=, 7 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push50=, $pop46, $pop51 +; NO-SIMD128-FAST-NEXT: local.tee $push49=, $15=, $pop50 +; NO-SIMD128-FAST-NEXT: i32.xor $push47=, $16, $pop49 +; NO-SIMD128-FAST-NEXT: i32.sub $push48=, $pop47, $15 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop48 ; NO-SIMD128-FAST-NEXT: return %a = sub <16 x i8> zeroinitializer, %x %b = icmp slt <16 x i8> %x, zeroinitializer @@ -2576,75 +2136,53 @@ define <16 x i8> @neg_v16i8(<16 x i8> %x) { ; NO-SIMD128: .functype neg_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 0 -; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $9 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop1 -; NO-SIMD128-NEXT: i32.const $push53=, 0 -; NO-SIMD128-NEXT: i32.sub $push2=, $pop53, $5 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push52=, 0 -; NO-SIMD128-NEXT: i32.sub $push3=, $pop52, $3 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push51=, 0 -; NO-SIMD128-NEXT: i32.sub $push4=, $pop51, $2 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push50=, 0 -; NO-SIMD128-NEXT: i32.sub $push5=, $pop50, $1 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop5 -; NO-SIMD128-NEXT: i32.const $push7=, 15 -; NO-SIMD128-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-NEXT: i32.const $push49=, 0 -; NO-SIMD128-NEXT: i32.sub $push6=, $pop49, $16 -; NO-SIMD128-NEXT: i32.store8 0($pop8), $pop6 -; NO-SIMD128-NEXT: i32.const $push10=, 14 -; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-NEXT: i32.const $push48=, 0 -; NO-SIMD128-NEXT: i32.sub $push9=, $pop48, $15 -; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9 -; NO-SIMD128-NEXT: i32.const $push13=, 
13 -; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-NEXT: i32.const $push47=, 0 -; NO-SIMD128-NEXT: i32.sub $push12=, $pop47, $14 -; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12 -; NO-SIMD128-NEXT: i32.const $push16=, 12 -; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.const $push46=, 0 -; NO-SIMD128-NEXT: i32.sub $push15=, $pop46, $13 -; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15 -; NO-SIMD128-NEXT: i32.const $push19=, 11 -; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19 -; NO-SIMD128-NEXT: i32.const $push45=, 0 -; NO-SIMD128-NEXT: i32.sub $push18=, $pop45, $12 -; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18 -; NO-SIMD128-NEXT: i32.const $push22=, 10 -; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22 -; NO-SIMD128-NEXT: i32.const $push44=, 0 -; NO-SIMD128-NEXT: i32.sub $push21=, $pop44, $11 -; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21 -; NO-SIMD128-NEXT: i32.const $push25=, 9 -; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25 -; NO-SIMD128-NEXT: i32.const $push43=, 0 -; NO-SIMD128-NEXT: i32.sub $push24=, $pop43, $10 -; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24 -; NO-SIMD128-NEXT: i32.const $push28=, 7 -; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28 -; NO-SIMD128-NEXT: i32.const $push42=, 0 -; NO-SIMD128-NEXT: i32.sub $push27=, $pop42, $8 -; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27 -; NO-SIMD128-NEXT: i32.const $push31=, 6 -; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31 -; NO-SIMD128-NEXT: i32.const $push41=, 0 -; NO-SIMD128-NEXT: i32.sub $push30=, $pop41, $7 -; NO-SIMD128-NEXT: i32.store8 0($pop32), $pop30 -; NO-SIMD128-NEXT: i32.const $push34=, 5 -; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34 -; NO-SIMD128-NEXT: i32.const $push40=, 0 -; NO-SIMD128-NEXT: i32.sub $push33=, $pop40, $6 -; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33 -; NO-SIMD128-NEXT: i32.const $push37=, 3 -; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37 -; NO-SIMD128-NEXT: i32.const $push39=, 0 -; NO-SIMD128-NEXT: i32.sub $push36=, $pop39, $4 
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36 +; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $16 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop1 +; NO-SIMD128-NEXT: i32.const $push31=, 0 +; NO-SIMD128-NEXT: i32.sub $push2=, $pop31, $15 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop2 +; NO-SIMD128-NEXT: i32.const $push30=, 0 +; NO-SIMD128-NEXT: i32.sub $push3=, $pop30, $14 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop3 +; NO-SIMD128-NEXT: i32.const $push29=, 0 +; NO-SIMD128-NEXT: i32.sub $push4=, $pop29, $13 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop4 +; NO-SIMD128-NEXT: i32.const $push28=, 0 +; NO-SIMD128-NEXT: i32.sub $push5=, $pop28, $12 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop5 +; NO-SIMD128-NEXT: i32.const $push27=, 0 +; NO-SIMD128-NEXT: i32.sub $push6=, $pop27, $11 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push26=, 0 +; NO-SIMD128-NEXT: i32.sub $push7=, $pop26, $10 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop7 +; NO-SIMD128-NEXT: i32.const $push25=, 0 +; NO-SIMD128-NEXT: i32.sub $push8=, $pop25, $9 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop8 +; NO-SIMD128-NEXT: i32.const $push24=, 0 +; NO-SIMD128-NEXT: i32.sub $push9=, $pop24, $8 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop9 +; NO-SIMD128-NEXT: i32.const $push23=, 0 +; NO-SIMD128-NEXT: i32.sub $push10=, $pop23, $7 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop10 +; NO-SIMD128-NEXT: i32.const $push22=, 0 +; NO-SIMD128-NEXT: i32.sub $push11=, $pop22, $6 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop11 +; NO-SIMD128-NEXT: i32.const $push21=, 0 +; NO-SIMD128-NEXT: i32.sub $push12=, $pop21, $5 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop12 +; NO-SIMD128-NEXT: i32.const $push20=, 0 +; NO-SIMD128-NEXT: i32.sub $push13=, $pop20, $4 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop13 +; NO-SIMD128-NEXT: i32.const $push19=, 0 +; NO-SIMD128-NEXT: i32.sub $push14=, $pop19, $3 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop14 +; NO-SIMD128-NEXT: i32.const $push18=, 0 +; NO-SIMD128-NEXT: i32.sub $push15=, $pop18, $2 +; 
NO-SIMD128-NEXT: i32.store8 1($0), $pop15 +; NO-SIMD128-NEXT: i32.const $push17=, 0 +; NO-SIMD128-NEXT: i32.sub $push16=, $pop17, $1 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop16 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: neg_v16i8: @@ -2653,73 +2191,51 @@ define <16 x i8> @neg_v16i8(<16 x i8> %x) { ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 0 ; NO-SIMD128-FAST-NEXT: i32.sub $push1=, $pop0, $1 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop1 -; NO-SIMD128-FAST-NEXT: i32.const $push53=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop53, $2 +; NO-SIMD128-FAST-NEXT: i32.const $push31=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop31, $2 ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push52=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop52, $3 +; NO-SIMD128-FAST-NEXT: i32.const $push30=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop30, $3 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push4=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push51=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop51, $4 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop5), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push50=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $pop50, $5 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop7 -; NO-SIMD128-FAST-NEXT: i32.const $push8=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push49=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push10=, $pop49, $6 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop9), $pop10 -; NO-SIMD128-FAST-NEXT: i32.const $push11=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-FAST-NEXT: i32.const $push48=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push13=, $pop48, $7 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop12), $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push14=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-FAST-NEXT: i32.const 
$push47=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push16=, $pop47, $8 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push46=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push17=, $pop46, $9 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push18=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-FAST-NEXT: i32.const $push45=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push20=, $pop45, $10 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop19), $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-FAST-NEXT: i32.const $push44=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push23=, $pop44, $11 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop23 -; NO-SIMD128-FAST-NEXT: i32.const $push24=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-FAST-NEXT: i32.const $push43=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push26=, $pop43, $12 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.const $push42=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push29=, $pop42, $13 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop29 -; NO-SIMD128-FAST-NEXT: i32.const $push30=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30 -; NO-SIMD128-FAST-NEXT: i32.const $push41=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push32=, $pop41, $14 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop32 -; NO-SIMD128-FAST-NEXT: i32.const $push33=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-FAST-NEXT: i32.const $push40=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push35=, $pop40, $15 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop35 -; NO-SIMD128-FAST-NEXT: i32.const $push36=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub 
$push38=, $pop39, $16 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop38 +; NO-SIMD128-FAST-NEXT: i32.const $push29=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $pop29, $4 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.const $push28=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $pop28, $5 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.const $push27=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop27, $6 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.const $push26=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $pop26, $7 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop7 +; NO-SIMD128-FAST-NEXT: i32.const $push25=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $pop25, $8 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.const $push24=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $pop24, $9 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop9 +; NO-SIMD128-FAST-NEXT: i32.const $push23=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push10=, $pop23, $10 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.const $push22=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push11=, $pop22, $11 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.const $push21=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $pop21, $12 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push20=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push13=, $pop20, $13 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop13 +; NO-SIMD128-FAST-NEXT: i32.const $push19=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push14=, $pop19, $14 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.const $push18=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $pop18, $15 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop15 +; NO-SIMD128-FAST-NEXT: i32.const $push17=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push16=, $pop17, $16 +; 
NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop16 ; NO-SIMD128-FAST-NEXT: return %a = sub <16 x i8> , @@ -2744,124 +2260,80 @@ define <16 x i8> @shl_v16i8(<16 x i8> %v, i8 %x) { ; NO-SIMD128: .functype shl_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 255 -; NO-SIMD128-NEXT: i32.and $push40=, $17, $pop0 -; NO-SIMD128-NEXT: local.tee $push39=, $17=, $pop40 -; NO-SIMD128-NEXT: i32.shl $push1=, $9, $pop39 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop1 -; NO-SIMD128-NEXT: i32.shl $push2=, $5, $17 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop2 -; NO-SIMD128-NEXT: i32.shl $push3=, $3, $17 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop3 -; NO-SIMD128-NEXT: i32.shl $push4=, $2, $17 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop4 -; NO-SIMD128-NEXT: i32.shl $push5=, $1, $17 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop5 -; NO-SIMD128-NEXT: i32.const $push7=, 15 -; NO-SIMD128-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-NEXT: i32.shl $push6=, $16, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop8), $pop6 -; NO-SIMD128-NEXT: i32.const $push10=, 14 -; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-NEXT: i32.shl $push9=, $15, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9 -; NO-SIMD128-NEXT: i32.const $push13=, 13 -; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-NEXT: i32.shl $push12=, $14, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12 -; NO-SIMD128-NEXT: i32.const $push16=, 12 -; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.shl $push15=, $13, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15 -; NO-SIMD128-NEXT: i32.const $push19=, 11 -; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19 -; NO-SIMD128-NEXT: i32.shl $push18=, $12, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18 -; NO-SIMD128-NEXT: i32.const $push22=, 10 -; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22 -; NO-SIMD128-NEXT: i32.shl $push21=, $11, $17 
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21 -; NO-SIMD128-NEXT: i32.const $push25=, 9 -; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25 -; NO-SIMD128-NEXT: i32.shl $push24=, $10, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24 -; NO-SIMD128-NEXT: i32.const $push28=, 7 -; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28 -; NO-SIMD128-NEXT: i32.shl $push27=, $8, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27 -; NO-SIMD128-NEXT: i32.const $push31=, 6 -; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31 -; NO-SIMD128-NEXT: i32.shl $push30=, $7, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop32), $pop30 -; NO-SIMD128-NEXT: i32.const $push34=, 5 -; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34 -; NO-SIMD128-NEXT: i32.shl $push33=, $6, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33 -; NO-SIMD128-NEXT: i32.const $push37=, 3 -; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37 -; NO-SIMD128-NEXT: i32.shl $push36=, $4, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36 +; NO-SIMD128-NEXT: i32.and $push18=, $17, $pop0 +; NO-SIMD128-NEXT: local.tee $push17=, $17=, $pop18 +; NO-SIMD128-NEXT: i32.shl $push1=, $16, $pop17 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop1 +; NO-SIMD128-NEXT: i32.shl $push2=, $15, $17 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop2 +; NO-SIMD128-NEXT: i32.shl $push3=, $14, $17 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop3 +; NO-SIMD128-NEXT: i32.shl $push4=, $13, $17 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop4 +; NO-SIMD128-NEXT: i32.shl $push5=, $12, $17 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop5 +; NO-SIMD128-NEXT: i32.shl $push6=, $11, $17 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop6 +; NO-SIMD128-NEXT: i32.shl $push7=, $10, $17 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop7 +; NO-SIMD128-NEXT: i32.shl $push8=, $9, $17 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop8 +; NO-SIMD128-NEXT: i32.shl $push9=, $8, $17 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop9 +; NO-SIMD128-NEXT: i32.shl $push10=, $7, $17 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop10 +; 
NO-SIMD128-NEXT: i32.shl $push11=, $6, $17 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop11 +; NO-SIMD128-NEXT: i32.shl $push12=, $5, $17 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop12 +; NO-SIMD128-NEXT: i32.shl $push13=, $4, $17 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop13 +; NO-SIMD128-NEXT: i32.shl $push14=, $3, $17 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop14 +; NO-SIMD128-NEXT: i32.shl $push15=, $2, $17 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop15 +; NO-SIMD128-NEXT: i32.shl $push16=, $1, $17 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop16 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shl_v16i8: ; NO-SIMD128-FAST: .functype shl_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push40=, $17, $pop0 -; NO-SIMD128-FAST-NEXT: local.tee $push39=, $17=, $pop40 -; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $2, $pop39 +; NO-SIMD128-FAST-NEXT: i32.and $push18=, $17, $pop0 +; NO-SIMD128-FAST-NEXT: local.tee $push17=, $17=, $pop18 +; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $2, $pop17 ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $1, $17 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2 ; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $17 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push4=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $17 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop5), $pop6 -; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $5, $17 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop7 -; NO-SIMD128-FAST-NEXT: i32.const $push8=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $6, $17 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop9), $pop10 -; NO-SIMD128-FAST-NEXT: i32.const $push11=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11 -; 
NO-SIMD128-FAST-NEXT: i32.shl $push13=, $7, $17 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop12), $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push14=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $17 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop16 -; NO-SIMD128-FAST-NEXT: i32.shl $push17=, $9, $17 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push18=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-FAST-NEXT: i32.shl $push20=, $10, $17 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop19), $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-FAST-NEXT: i32.shl $push23=, $11, $17 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop23 -; NO-SIMD128-FAST-NEXT: i32.const $push24=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-FAST-NEXT: i32.shl $push26=, $12, $17 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.shl $push29=, $13, $17 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop29 -; NO-SIMD128-FAST-NEXT: i32.const $push30=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30 -; NO-SIMD128-FAST-NEXT: i32.shl $push32=, $14, $17 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop32 -; NO-SIMD128-FAST-NEXT: i32.const $push33=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-FAST-NEXT: i32.shl $push35=, $15, $17 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop35 -; NO-SIMD128-FAST-NEXT: i32.const $push36=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36 -; NO-SIMD128-FAST-NEXT: i32.shl $push38=, $16, $17 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop38 +; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $17 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.shl 
$push5=, $5, $17 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $6, $17 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $7, $17 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop7 +; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $8, $17 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.shl $push9=, $9, $17 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop9 +; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $10, $17 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.shl $push11=, $11, $17 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $12, $17 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $13, $17 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop13 +; NO-SIMD128-FAST-NEXT: i32.shl $push14=, $14, $17 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.shl $push15=, $15, $17 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop15 +; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $16, $17 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop16 ; NO-SIMD128-FAST-NEXT: return %t = insertelement <16 x i8> undef, i8 %x, i32 0 %s = shufflevector <16 x i8> %t, <16 x i8> undef, @@ -2890,75 +2362,53 @@ define <16 x i8> @shl_const_v16i8(<16 x i8> %v) { ; NO-SIMD128: .functype shl_const_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 5 -; NO-SIMD128-NEXT: i32.shl $push1=, $9, $pop0 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop1 -; NO-SIMD128-NEXT: i32.const $push53=, 5 -; NO-SIMD128-NEXT: i32.shl $push2=, $5, $pop53 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push52=, 5 -; NO-SIMD128-NEXT: i32.shl $push3=, $3, $pop52 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push51=, 5 -; 
NO-SIMD128-NEXT: i32.shl $push4=, $2, $pop51 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push50=, 5 -; NO-SIMD128-NEXT: i32.shl $push5=, $1, $pop50 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop5 -; NO-SIMD128-NEXT: i32.const $push7=, 15 -; NO-SIMD128-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-NEXT: i32.const $push49=, 5 -; NO-SIMD128-NEXT: i32.shl $push6=, $16, $pop49 -; NO-SIMD128-NEXT: i32.store8 0($pop8), $pop6 -; NO-SIMD128-NEXT: i32.const $push10=, 14 -; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-NEXT: i32.const $push48=, 5 -; NO-SIMD128-NEXT: i32.shl $push9=, $15, $pop48 -; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9 -; NO-SIMD128-NEXT: i32.const $push13=, 13 -; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-NEXT: i32.const $push47=, 5 -; NO-SIMD128-NEXT: i32.shl $push12=, $14, $pop47 -; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12 -; NO-SIMD128-NEXT: i32.const $push16=, 12 -; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.const $push46=, 5 -; NO-SIMD128-NEXT: i32.shl $push15=, $13, $pop46 -; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15 -; NO-SIMD128-NEXT: i32.const $push19=, 11 -; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19 -; NO-SIMD128-NEXT: i32.const $push45=, 5 -; NO-SIMD128-NEXT: i32.shl $push18=, $12, $pop45 -; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18 -; NO-SIMD128-NEXT: i32.const $push22=, 10 -; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22 -; NO-SIMD128-NEXT: i32.const $push44=, 5 -; NO-SIMD128-NEXT: i32.shl $push21=, $11, $pop44 -; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21 -; NO-SIMD128-NEXT: i32.const $push25=, 9 -; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25 -; NO-SIMD128-NEXT: i32.const $push43=, 5 -; NO-SIMD128-NEXT: i32.shl $push24=, $10, $pop43 -; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24 -; NO-SIMD128-NEXT: i32.const $push28=, 7 -; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28 -; NO-SIMD128-NEXT: i32.const $push42=, 5 -; NO-SIMD128-NEXT: 
i32.shl $push27=, $8, $pop42 -; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27 -; NO-SIMD128-NEXT: i32.const $push31=, 6 -; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31 -; NO-SIMD128-NEXT: i32.const $push41=, 5 -; NO-SIMD128-NEXT: i32.shl $push30=, $7, $pop41 -; NO-SIMD128-NEXT: i32.store8 0($pop32), $pop30 -; NO-SIMD128-NEXT: i32.const $push40=, 5 -; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop40 -; NO-SIMD128-NEXT: i32.const $push39=, 5 -; NO-SIMD128-NEXT: i32.shl $push33=, $6, $pop39 -; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop33 -; NO-SIMD128-NEXT: i32.const $push36=, 3 -; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36 -; NO-SIMD128-NEXT: i32.const $push38=, 5 -; NO-SIMD128-NEXT: i32.shl $push35=, $4, $pop38 -; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35 +; NO-SIMD128-NEXT: i32.shl $push1=, $16, $pop0 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop1 +; NO-SIMD128-NEXT: i32.const $push31=, 5 +; NO-SIMD128-NEXT: i32.shl $push2=, $15, $pop31 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop2 +; NO-SIMD128-NEXT: i32.const $push30=, 5 +; NO-SIMD128-NEXT: i32.shl $push3=, $14, $pop30 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop3 +; NO-SIMD128-NEXT: i32.const $push29=, 5 +; NO-SIMD128-NEXT: i32.shl $push4=, $13, $pop29 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop4 +; NO-SIMD128-NEXT: i32.const $push28=, 5 +; NO-SIMD128-NEXT: i32.shl $push5=, $12, $pop28 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop5 +; NO-SIMD128-NEXT: i32.const $push27=, 5 +; NO-SIMD128-NEXT: i32.shl $push6=, $11, $pop27 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push26=, 5 +; NO-SIMD128-NEXT: i32.shl $push7=, $10, $pop26 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop7 +; NO-SIMD128-NEXT: i32.const $push25=, 5 +; NO-SIMD128-NEXT: i32.shl $push8=, $9, $pop25 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop8 +; NO-SIMD128-NEXT: i32.const $push24=, 5 +; NO-SIMD128-NEXT: i32.shl $push9=, $8, $pop24 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop9 +; NO-SIMD128-NEXT: i32.const $push23=, 5 +; 
NO-SIMD128-NEXT: i32.shl $push10=, $7, $pop23 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop10 +; NO-SIMD128-NEXT: i32.const $push22=, 5 +; NO-SIMD128-NEXT: i32.shl $push11=, $6, $pop22 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop11 +; NO-SIMD128-NEXT: i32.const $push21=, 5 +; NO-SIMD128-NEXT: i32.shl $push12=, $5, $pop21 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop12 +; NO-SIMD128-NEXT: i32.const $push20=, 5 +; NO-SIMD128-NEXT: i32.shl $push13=, $4, $pop20 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop13 +; NO-SIMD128-NEXT: i32.const $push19=, 5 +; NO-SIMD128-NEXT: i32.shl $push14=, $3, $pop19 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop14 +; NO-SIMD128-NEXT: i32.const $push18=, 5 +; NO-SIMD128-NEXT: i32.shl $push15=, $2, $pop18 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop15 +; NO-SIMD128-NEXT: i32.const $push17=, 5 +; NO-SIMD128-NEXT: i32.shl $push16=, $1, $pop17 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop16 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shl_const_v16i8: @@ -2967,73 +2417,51 @@ define <16 x i8> @shl_const_v16i8(<16 x i8> %v) { ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 5 ; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $1, $pop0 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop1 -; NO-SIMD128-FAST-NEXT: i32.const $push53=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop53 +; NO-SIMD128-FAST-NEXT: i32.const $push31=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop31 ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push52=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop52 +; NO-SIMD128-FAST-NEXT: i32.const $push30=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop30 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push4=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push51=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $pop51 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop5), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push50=, 5 -; 
NO-SIMD128-FAST-NEXT: i32.shl $push7=, $5, $pop50 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop7 -; NO-SIMD128-FAST-NEXT: i32.const $push49=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop49 -; NO-SIMD128-FAST-NEXT: i32.const $push48=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push9=, $6, $pop48 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-FAST-NEXT: i32.const $push47=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $7, $pop47 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push46=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push15=, $8, $pop46 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15 -; NO-SIMD128-FAST-NEXT: i32.const $push45=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $9, $pop45 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push44=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push19=, $10, $pop44 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19 -; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push43=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push22=, $11, $pop43 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-FAST-NEXT: i32.const $push42=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push25=, $12, $pop42 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push41=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push28=, $13, $pop41 -; 
NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28 -; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-FAST-NEXT: i32.const $push40=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push31=, $14, $pop40 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31 -; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push34=, $15, $pop39 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34 -; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35 -; NO-SIMD128-FAST-NEXT: i32.const $push38=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push37=, $16, $pop38 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37 +; NO-SIMD128-FAST-NEXT: i32.const $push29=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $pop29 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.const $push28=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $5, $pop28 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $6, $pop27 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.const $push26=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $7, $pop26 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop7 +; NO-SIMD128-FAST-NEXT: i32.const $push25=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $8, $pop25 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.const $push24=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push9=, $9, $pop24 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop9 +; NO-SIMD128-FAST-NEXT: i32.const $push23=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $10, $pop23 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.const $push22=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push11=, $11, $pop22 +; NO-SIMD128-FAST-NEXT: 
i32.store8 10($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.const $push21=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $12, $pop21 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push20=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $13, $pop20 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop13 +; NO-SIMD128-FAST-NEXT: i32.const $push19=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push14=, $14, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.const $push18=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push15=, $15, $pop18 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop15 +; NO-SIMD128-FAST-NEXT: i32.const $push17=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $16, $pop17 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop16 ; NO-SIMD128-FAST-NEXT: return %a = shl <16 x i8> %v, @shl_vec_v16i8(<16 x i8> %v, <16 x i8> %x) { ; NO-SIMD128: .functype shl_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 255 -; NO-SIMD128-NEXT: i32.and $push1=, $25, $pop0 -; NO-SIMD128-NEXT: i32.shl $push2=, $9, $pop1 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push69=, 255 -; NO-SIMD128-NEXT: i32.and $push3=, $21, $pop69 -; NO-SIMD128-NEXT: i32.shl $push4=, $5, $pop3 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push68=, 255 -; NO-SIMD128-NEXT: i32.and $push5=, $19, $pop68 -; NO-SIMD128-NEXT: i32.shl $push6=, $3, $pop5 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop6 -; NO-SIMD128-NEXT: i32.const $push67=, 255 -; NO-SIMD128-NEXT: i32.and $push7=, $18, $pop67 -; NO-SIMD128-NEXT: i32.shl $push8=, $2, $pop7 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop8 -; NO-SIMD128-NEXT: i32.const $push66=, 255 -; NO-SIMD128-NEXT: i32.and $push9=, $17, $pop66 -; NO-SIMD128-NEXT: i32.shl $push10=, $1, 
$pop9 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop10 -; NO-SIMD128-NEXT: i32.const $push13=, 15 -; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-NEXT: i32.const $push65=, 255 -; NO-SIMD128-NEXT: i32.and $push11=, $32, $pop65 -; NO-SIMD128-NEXT: i32.shl $push12=, $16, $pop11 -; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12 -; NO-SIMD128-NEXT: i32.const $push17=, 14 -; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-NEXT: i32.const $push64=, 255 -; NO-SIMD128-NEXT: i32.and $push15=, $31, $pop64 -; NO-SIMD128-NEXT: i32.shl $push16=, $15, $pop15 -; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16 -; NO-SIMD128-NEXT: i32.const $push21=, 13 -; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-NEXT: i32.const $push63=, 255 -; NO-SIMD128-NEXT: i32.and $push19=, $30, $pop63 -; NO-SIMD128-NEXT: i32.shl $push20=, $14, $pop19 -; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20 -; NO-SIMD128-NEXT: i32.const $push25=, 12 -; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25 -; NO-SIMD128-NEXT: i32.const $push62=, 255 -; NO-SIMD128-NEXT: i32.and $push23=, $29, $pop62 -; NO-SIMD128-NEXT: i32.shl $push24=, $13, $pop23 -; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24 -; NO-SIMD128-NEXT: i32.const $push29=, 11 -; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-NEXT: i32.const $push61=, 255 -; NO-SIMD128-NEXT: i32.and $push27=, $28, $pop61 -; NO-SIMD128-NEXT: i32.shl $push28=, $12, $pop27 -; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28 -; NO-SIMD128-NEXT: i32.const $push33=, 10 -; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-NEXT: i32.const $push60=, 255 -; NO-SIMD128-NEXT: i32.and $push31=, $27, $pop60 -; NO-SIMD128-NEXT: i32.shl $push32=, $11, $pop31 -; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32 -; NO-SIMD128-NEXT: i32.const $push37=, 9 -; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37 -; NO-SIMD128-NEXT: i32.const $push59=, 255 -; NO-SIMD128-NEXT: i32.and $push35=, $26, $pop59 -; NO-SIMD128-NEXT: i32.shl $push36=, $10, $pop35 -; 
NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36 -; NO-SIMD128-NEXT: i32.const $push41=, 7 -; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41 -; NO-SIMD128-NEXT: i32.const $push58=, 255 -; NO-SIMD128-NEXT: i32.and $push39=, $24, $pop58 -; NO-SIMD128-NEXT: i32.shl $push40=, $8, $pop39 -; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40 -; NO-SIMD128-NEXT: i32.const $push45=, 6 -; NO-SIMD128-NEXT: i32.add $push46=, $0, $pop45 -; NO-SIMD128-NEXT: i32.const $push57=, 255 -; NO-SIMD128-NEXT: i32.and $push43=, $23, $pop57 -; NO-SIMD128-NEXT: i32.shl $push44=, $7, $pop43 -; NO-SIMD128-NEXT: i32.store8 0($pop46), $pop44 -; NO-SIMD128-NEXT: i32.const $push49=, 5 -; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49 -; NO-SIMD128-NEXT: i32.const $push56=, 255 -; NO-SIMD128-NEXT: i32.and $push47=, $22, $pop56 -; NO-SIMD128-NEXT: i32.shl $push48=, $6, $pop47 -; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48 -; NO-SIMD128-NEXT: i32.const $push53=, 3 -; NO-SIMD128-NEXT: i32.add $push54=, $0, $pop53 -; NO-SIMD128-NEXT: i32.const $push55=, 255 -; NO-SIMD128-NEXT: i32.and $push51=, $20, $pop55 -; NO-SIMD128-NEXT: i32.shl $push52=, $4, $pop51 -; NO-SIMD128-NEXT: i32.store8 0($pop54), $pop52 +; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop0 +; NO-SIMD128-NEXT: i32.shl $push2=, $16, $pop1 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop2 +; NO-SIMD128-NEXT: i32.const $push47=, 255 +; NO-SIMD128-NEXT: i32.and $push3=, $31, $pop47 +; NO-SIMD128-NEXT: i32.shl $push4=, $15, $pop3 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop4 +; NO-SIMD128-NEXT: i32.const $push46=, 255 +; NO-SIMD128-NEXT: i32.and $push5=, $30, $pop46 +; NO-SIMD128-NEXT: i32.shl $push6=, $14, $pop5 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push45=, 255 +; NO-SIMD128-NEXT: i32.and $push7=, $29, $pop45 +; NO-SIMD128-NEXT: i32.shl $push8=, $13, $pop7 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop8 +; NO-SIMD128-NEXT: i32.const $push44=, 255 +; NO-SIMD128-NEXT: i32.and $push9=, $28, $pop44 +; NO-SIMD128-NEXT: i32.shl 
$push10=, $12, $pop9 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop10 +; NO-SIMD128-NEXT: i32.const $push43=, 255 +; NO-SIMD128-NEXT: i32.and $push11=, $27, $pop43 +; NO-SIMD128-NEXT: i32.shl $push12=, $11, $pop11 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop12 +; NO-SIMD128-NEXT: i32.const $push42=, 255 +; NO-SIMD128-NEXT: i32.and $push13=, $26, $pop42 +; NO-SIMD128-NEXT: i32.shl $push14=, $10, $pop13 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop14 +; NO-SIMD128-NEXT: i32.const $push41=, 255 +; NO-SIMD128-NEXT: i32.and $push15=, $25, $pop41 +; NO-SIMD128-NEXT: i32.shl $push16=, $9, $pop15 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop16 +; NO-SIMD128-NEXT: i32.const $push40=, 255 +; NO-SIMD128-NEXT: i32.and $push17=, $24, $pop40 +; NO-SIMD128-NEXT: i32.shl $push18=, $8, $pop17 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop18 +; NO-SIMD128-NEXT: i32.const $push39=, 255 +; NO-SIMD128-NEXT: i32.and $push19=, $23, $pop39 +; NO-SIMD128-NEXT: i32.shl $push20=, $7, $pop19 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop20 +; NO-SIMD128-NEXT: i32.const $push38=, 255 +; NO-SIMD128-NEXT: i32.and $push21=, $22, $pop38 +; NO-SIMD128-NEXT: i32.shl $push22=, $6, $pop21 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop22 +; NO-SIMD128-NEXT: i32.const $push37=, 255 +; NO-SIMD128-NEXT: i32.and $push23=, $21, $pop37 +; NO-SIMD128-NEXT: i32.shl $push24=, $5, $pop23 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop24 +; NO-SIMD128-NEXT: i32.const $push36=, 255 +; NO-SIMD128-NEXT: i32.and $push25=, $20, $pop36 +; NO-SIMD128-NEXT: i32.shl $push26=, $4, $pop25 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop26 +; NO-SIMD128-NEXT: i32.const $push35=, 255 +; NO-SIMD128-NEXT: i32.and $push27=, $19, $pop35 +; NO-SIMD128-NEXT: i32.shl $push28=, $3, $pop27 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop28 +; NO-SIMD128-NEXT: i32.const $push34=, 255 +; NO-SIMD128-NEXT: i32.and $push29=, $18, $pop34 +; NO-SIMD128-NEXT: i32.shl $push30=, $2, $pop29 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop30 +; NO-SIMD128-NEXT: i32.const $push33=, 255 +; 
NO-SIMD128-NEXT: i32.and $push31=, $17, $pop33 +; NO-SIMD128-NEXT: i32.shl $push32=, $1, $pop31 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop32 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shl_vec_v16i8: @@ -3342,88 +2748,66 @@ define <16 x i8> @shl_vec_v16i8(<16 x i8> %v, <16 x i8> %x) { ; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop0 ; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $1, $pop1 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push3=, $18, $pop69 +; NO-SIMD128-FAST-NEXT: i32.const $push47=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push3=, $18, $pop47 ; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $2, $pop3 ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $19, $pop68 +; NO-SIMD128-FAST-NEXT: i32.const $push46=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $19, $pop46 ; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $3, $pop5 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push9=, $20, $pop67 -; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $4, $pop9 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop10 -; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push11=, $21, $pop66 -; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $5, $pop11 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push15=, $22, $pop65 -; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $6, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; 
NO-SIMD128-FAST-NEXT: i32.const $push64=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push19=, $23, $pop64 -; NO-SIMD128-FAST-NEXT: i32.shl $push20=, $7, $pop19 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-FAST-NEXT: i32.const $push63=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push23=, $24, $pop63 -; NO-SIMD128-FAST-NEXT: i32.shl $push24=, $8, $pop23 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop24 -; NO-SIMD128-FAST-NEXT: i32.const $push62=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push25=, $25, $pop62 -; NO-SIMD128-FAST-NEXT: i32.shl $push26=, $9, $pop25 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.const $push61=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push29=, $26, $pop61 -; NO-SIMD128-FAST-NEXT: i32.shl $push30=, $10, $pop29 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop30 -; NO-SIMD128-FAST-NEXT: i32.const $push31=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31 -; NO-SIMD128-FAST-NEXT: i32.const $push60=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push33=, $27, $pop60 -; NO-SIMD128-FAST-NEXT: i32.shl $push34=, $11, $pop33 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop32), $pop34 -; NO-SIMD128-FAST-NEXT: i32.const $push35=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35 -; NO-SIMD128-FAST-NEXT: i32.const $push59=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push37=, $28, $pop59 -; NO-SIMD128-FAST-NEXT: i32.shl $push38=, $12, $pop37 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop38 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39 -; NO-SIMD128-FAST-NEXT: i32.const $push58=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push41=, $29, $pop58 -; NO-SIMD128-FAST-NEXT: i32.shl $push42=, $13, $pop41 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop42 -; 
NO-SIMD128-FAST-NEXT: i32.const $push43=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push44=, $0, $pop43 -; NO-SIMD128-FAST-NEXT: i32.const $push57=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push45=, $30, $pop57 -; NO-SIMD128-FAST-NEXT: i32.shl $push46=, $14, $pop45 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop44), $pop46 -; NO-SIMD128-FAST-NEXT: i32.const $push47=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push48=, $0, $pop47 -; NO-SIMD128-FAST-NEXT: i32.const $push56=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push49=, $31, $pop56 -; NO-SIMD128-FAST-NEXT: i32.shl $push50=, $15, $pop49 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop48), $pop50 -; NO-SIMD128-FAST-NEXT: i32.const $push51=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push52=, $0, $pop51 -; NO-SIMD128-FAST-NEXT: i32.const $push55=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push53=, $32, $pop55 -; NO-SIMD128-FAST-NEXT: i32.shl $push54=, $16, $pop53 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop52), $pop54 +; NO-SIMD128-FAST-NEXT: i32.const $push45=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $20, $pop45 +; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $4, $pop7 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.const $push44=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push9=, $21, $pop44 +; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $5, $pop9 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.const $push43=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push11=, $22, $pop43 +; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $6, $pop11 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push42=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $23, $pop42 +; NO-SIMD128-FAST-NEXT: i32.shl $push14=, $7, $pop13 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.const $push41=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push15=, $24, $pop41 +; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $pop15 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop16 +; 
NO-SIMD128-FAST-NEXT: i32.const $push40=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $25, $pop40 +; NO-SIMD128-FAST-NEXT: i32.shl $push18=, $9, $pop17 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop18 +; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $26, $pop39 +; NO-SIMD128-FAST-NEXT: i32.shl $push20=, $10, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.const $push38=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push21=, $27, $pop38 +; NO-SIMD128-FAST-NEXT: i32.shl $push22=, $11, $pop21 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop22 +; NO-SIMD128-FAST-NEXT: i32.const $push37=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push23=, $28, $pop37 +; NO-SIMD128-FAST-NEXT: i32.shl $push24=, $12, $pop23 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.const $push36=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push25=, $29, $pop36 +; NO-SIMD128-FAST-NEXT: i32.shl $push26=, $13, $pop25 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop26 +; NO-SIMD128-FAST-NEXT: i32.const $push35=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push27=, $30, $pop35 +; NO-SIMD128-FAST-NEXT: i32.shl $push28=, $14, $pop27 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop28 +; NO-SIMD128-FAST-NEXT: i32.const $push34=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push29=, $31, $pop34 +; NO-SIMD128-FAST-NEXT: i32.shl $push30=, $15, $pop29 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop30 +; NO-SIMD128-FAST-NEXT: i32.const $push33=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push31=, $32, $pop33 +; NO-SIMD128-FAST-NEXT: i32.shl $push32=, $16, $pop31 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop32 ; NO-SIMD128-FAST-NEXT: return %a = shl <16 x i8> %v, %x ret <16 x i8> %a @@ -3445,79 +2829,57 @@ define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) { ; NO-SIMD128-LABEL: shr_s_v16i8: ; NO-SIMD128: .functype shr_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; 
NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.extend8_s $push1=, $9 +; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16 ; NO-SIMD128-NEXT: i32.const $push0=, 255 -; NO-SIMD128-NEXT: i32.and $push56=, $17, $pop0 -; NO-SIMD128-NEXT: local.tee $push55=, $17=, $pop56 -; NO-SIMD128-NEXT: i32.shr_s $push2=, $pop1, $pop55 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop2 -; NO-SIMD128-NEXT: i32.extend8_s $push3=, $5 +; NO-SIMD128-NEXT: i32.and $push34=, $17, $pop0 +; NO-SIMD128-NEXT: local.tee $push33=, $17=, $pop34 +; NO-SIMD128-NEXT: i32.shr_s $push2=, $pop1, $pop33 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop2 +; NO-SIMD128-NEXT: i32.extend8_s $push3=, $15 ; NO-SIMD128-NEXT: i32.shr_s $push4=, $pop3, $17 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop4 -; NO-SIMD128-NEXT: i32.extend8_s $push5=, $3 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop4 +; NO-SIMD128-NEXT: i32.extend8_s $push5=, $14 ; NO-SIMD128-NEXT: i32.shr_s $push6=, $pop5, $17 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop6 -; NO-SIMD128-NEXT: i32.extend8_s $push7=, $2 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop6 +; NO-SIMD128-NEXT: i32.extend8_s $push7=, $13 ; NO-SIMD128-NEXT: i32.shr_s $push8=, $pop7, $17 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop8 -; NO-SIMD128-NEXT: i32.extend8_s $push9=, $1 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop8 +; NO-SIMD128-NEXT: i32.extend8_s $push9=, $12 ; NO-SIMD128-NEXT: i32.shr_s $push10=, $pop9, $17 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop10 -; NO-SIMD128-NEXT: i32.const $push13=, 15 -; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-NEXT: i32.extend8_s $push11=, $16 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop10 +; NO-SIMD128-NEXT: i32.extend8_s $push11=, $11 ; NO-SIMD128-NEXT: i32.shr_s $push12=, $pop11, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12 -; NO-SIMD128-NEXT: i32.const $push17=, 14 -; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-NEXT: i32.extend8_s $push15=, $15 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop12 +; NO-SIMD128-NEXT: i32.extend8_s 
$push13=, $10 +; NO-SIMD128-NEXT: i32.shr_s $push14=, $pop13, $17 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop14 +; NO-SIMD128-NEXT: i32.extend8_s $push15=, $9 ; NO-SIMD128-NEXT: i32.shr_s $push16=, $pop15, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16 -; NO-SIMD128-NEXT: i32.const $push21=, 13 -; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-NEXT: i32.extend8_s $push19=, $14 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop16 +; NO-SIMD128-NEXT: i32.extend8_s $push17=, $8 +; NO-SIMD128-NEXT: i32.shr_s $push18=, $pop17, $17 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop18 +; NO-SIMD128-NEXT: i32.extend8_s $push19=, $7 ; NO-SIMD128-NEXT: i32.shr_s $push20=, $pop19, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20 -; NO-SIMD128-NEXT: i32.const $push25=, 12 -; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25 -; NO-SIMD128-NEXT: i32.extend8_s $push23=, $13 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop20 +; NO-SIMD128-NEXT: i32.extend8_s $push21=, $6 +; NO-SIMD128-NEXT: i32.shr_s $push22=, $pop21, $17 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop22 +; NO-SIMD128-NEXT: i32.extend8_s $push23=, $5 ; NO-SIMD128-NEXT: i32.shr_s $push24=, $pop23, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24 -; NO-SIMD128-NEXT: i32.const $push29=, 11 -; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-NEXT: i32.extend8_s $push27=, $12 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop24 +; NO-SIMD128-NEXT: i32.extend8_s $push25=, $4 +; NO-SIMD128-NEXT: i32.shr_s $push26=, $pop25, $17 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop26 +; NO-SIMD128-NEXT: i32.extend8_s $push27=, $3 ; NO-SIMD128-NEXT: i32.shr_s $push28=, $pop27, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28 -; NO-SIMD128-NEXT: i32.const $push33=, 10 -; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-NEXT: i32.extend8_s $push31=, $11 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop28 +; NO-SIMD128-NEXT: i32.extend8_s $push29=, $2 +; NO-SIMD128-NEXT: i32.shr_s $push30=, $pop29, $17 +; NO-SIMD128-NEXT: 
i32.store8 1($0), $pop30 +; NO-SIMD128-NEXT: i32.extend8_s $push31=, $1 ; NO-SIMD128-NEXT: i32.shr_s $push32=, $pop31, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32 -; NO-SIMD128-NEXT: i32.const $push37=, 9 -; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37 -; NO-SIMD128-NEXT: i32.extend8_s $push35=, $10 -; NO-SIMD128-NEXT: i32.shr_s $push36=, $pop35, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36 -; NO-SIMD128-NEXT: i32.const $push41=, 7 -; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41 -; NO-SIMD128-NEXT: i32.extend8_s $push39=, $8 -; NO-SIMD128-NEXT: i32.shr_s $push40=, $pop39, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40 -; NO-SIMD128-NEXT: i32.const $push45=, 6 -; NO-SIMD128-NEXT: i32.add $push46=, $0, $pop45 -; NO-SIMD128-NEXT: i32.extend8_s $push43=, $7 -; NO-SIMD128-NEXT: i32.shr_s $push44=, $pop43, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop46), $pop44 -; NO-SIMD128-NEXT: i32.const $push49=, 5 -; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49 -; NO-SIMD128-NEXT: i32.extend8_s $push47=, $6 -; NO-SIMD128-NEXT: i32.shr_s $push48=, $pop47, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48 -; NO-SIMD128-NEXT: i32.const $push53=, 3 -; NO-SIMD128-NEXT: i32.add $push54=, $0, $pop53 -; NO-SIMD128-NEXT: i32.extend8_s $push51=, $4 -; NO-SIMD128-NEXT: i32.shr_s $push52=, $pop51, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop54), $pop52 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop32 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shr_s_v16i8: @@ -3525,9 +2887,9 @@ define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) { ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.extend8_s $push1=, $1 ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push56=, $17, $pop0 -; NO-SIMD128-FAST-NEXT: local.tee $push55=, $1=, $pop56 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $pop1, $pop55 +; NO-SIMD128-FAST-NEXT: i32.and $push34=, $17, $pop0 +; NO-SIMD128-FAST-NEXT: local.tee $push33=, $1=, $pop34 +; NO-SIMD128-FAST-NEXT: i32.shr_s 
$push2=, $pop1, $pop33 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2 ; NO-SIMD128-FAST-NEXT: i32.extend8_s $push3=, $2 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push4=, $pop3, $1 @@ -3535,67 +2897,45 @@ define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) { ; NO-SIMD128-FAST-NEXT: i32.extend8_s $push5=, $3 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push6=, $pop5, $1 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push9=, $4 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push7=, $4 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push8=, $pop7, $1 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push9=, $5 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push10=, $pop9, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop10 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $5 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $6 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $pop11, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $6 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $7 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push14=, $pop13, $1 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $8 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push16=, $pop15, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $7 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push17=, $9 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push18=, $pop17, $1 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), 
$pop18 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $10 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push20=, $pop19, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $8 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $11 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push22=, $pop21, $1 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop22 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $12 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push24=, $pop23, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop24 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $9 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $13 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push26=, $pop25, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $10 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop26 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push27=, $14 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push28=, $pop27, $1 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop28 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $15 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push30=, $pop29, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop30 -; NO-SIMD128-FAST-NEXT: i32.const $push31=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push33=, $11 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push34=, $pop33, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop32), $pop34 -; NO-SIMD128-FAST-NEXT: i32.const $push35=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push37=, $12 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push38=, $pop37, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 
0($pop36), $pop38 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $13 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push42=, $pop41, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop42 -; NO-SIMD128-FAST-NEXT: i32.const $push43=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push44=, $0, $pop43 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $14 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push46=, $pop45, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop44), $pop46 -; NO-SIMD128-FAST-NEXT: i32.const $push47=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push48=, $0, $pop47 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push49=, $15 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push50=, $pop49, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop48), $pop50 -; NO-SIMD128-FAST-NEXT: i32.const $push51=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push52=, $0, $pop51 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push53=, $16 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push54=, $pop53, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop52), $pop54 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop30 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push31=, $16 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push32=, $pop31, $1 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop32 ; NO-SIMD128-FAST-NEXT: return %t = insertelement <16 x i8> undef, i8 %x, i32 0 %s = shufflevector <16 x i8> %t, <16 x i8> undef, @@ -3811,108 +3151,86 @@ define <16 x i8> @shr_s_vec_v16i8(<16 x i8> %v, <16 x i8> %x) { ; NO-SIMD128-LABEL: shr_s_vec_v16i8: ; NO-SIMD128: .functype shr_s_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.extend8_s $push2=, $9 +; NO-SIMD128-NEXT: i32.extend8_s $push2=, $16 ; NO-SIMD128-NEXT: i32.const $push0=, 255 -; NO-SIMD128-NEXT: i32.and $push1=, $25, $pop0 +; NO-SIMD128-NEXT: i32.and 
$push1=, $32, $pop0 ; NO-SIMD128-NEXT: i32.shr_s $push3=, $pop2, $pop1 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop3 -; NO-SIMD128-NEXT: i32.extend8_s $push5=, $5 -; NO-SIMD128-NEXT: i32.const $push85=, 255 -; NO-SIMD128-NEXT: i32.and $push4=, $21, $pop85 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop3 +; NO-SIMD128-NEXT: i32.extend8_s $push5=, $15 +; NO-SIMD128-NEXT: i32.const $push63=, 255 +; NO-SIMD128-NEXT: i32.and $push4=, $31, $pop63 ; NO-SIMD128-NEXT: i32.shr_s $push6=, $pop5, $pop4 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop6 -; NO-SIMD128-NEXT: i32.extend8_s $push8=, $3 -; NO-SIMD128-NEXT: i32.const $push84=, 255 -; NO-SIMD128-NEXT: i32.and $push7=, $19, $pop84 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop6 +; NO-SIMD128-NEXT: i32.extend8_s $push8=, $14 +; NO-SIMD128-NEXT: i32.const $push62=, 255 +; NO-SIMD128-NEXT: i32.and $push7=, $30, $pop62 ; NO-SIMD128-NEXT: i32.shr_s $push9=, $pop8, $pop7 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop9 -; NO-SIMD128-NEXT: i32.extend8_s $push11=, $2 -; NO-SIMD128-NEXT: i32.const $push83=, 255 -; NO-SIMD128-NEXT: i32.and $push10=, $18, $pop83 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop9 +; NO-SIMD128-NEXT: i32.extend8_s $push11=, $13 +; NO-SIMD128-NEXT: i32.const $push61=, 255 +; NO-SIMD128-NEXT: i32.and $push10=, $29, $pop61 ; NO-SIMD128-NEXT: i32.shr_s $push12=, $pop11, $pop10 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop12 -; NO-SIMD128-NEXT: i32.extend8_s $push14=, $1 -; NO-SIMD128-NEXT: i32.const $push82=, 255 -; NO-SIMD128-NEXT: i32.and $push13=, $17, $pop82 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop12 +; NO-SIMD128-NEXT: i32.extend8_s $push14=, $12 +; NO-SIMD128-NEXT: i32.const $push60=, 255 +; NO-SIMD128-NEXT: i32.and $push13=, $28, $pop60 ; NO-SIMD128-NEXT: i32.shr_s $push15=, $pop14, $pop13 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop15 -; NO-SIMD128-NEXT: i32.const $push19=, 15 -; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19 -; NO-SIMD128-NEXT: i32.extend8_s $push17=, $16 -; NO-SIMD128-NEXT: i32.const $push81=, 255 -; 
NO-SIMD128-NEXT: i32.and $push16=, $32, $pop81 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop15 +; NO-SIMD128-NEXT: i32.extend8_s $push17=, $11 +; NO-SIMD128-NEXT: i32.const $push59=, 255 +; NO-SIMD128-NEXT: i32.and $push16=, $27, $pop59 ; NO-SIMD128-NEXT: i32.shr_s $push18=, $pop17, $pop16 -; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18 -; NO-SIMD128-NEXT: i32.const $push24=, 14 -; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-NEXT: i32.extend8_s $push22=, $15 -; NO-SIMD128-NEXT: i32.const $push80=, 255 -; NO-SIMD128-NEXT: i32.and $push21=, $31, $pop80 -; NO-SIMD128-NEXT: i32.shr_s $push23=, $pop22, $pop21 -; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23 -; NO-SIMD128-NEXT: i32.const $push29=, 13 -; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-NEXT: i32.extend8_s $push27=, $14 -; NO-SIMD128-NEXT: i32.const $push79=, 255 -; NO-SIMD128-NEXT: i32.and $push26=, $30, $pop79 -; NO-SIMD128-NEXT: i32.shr_s $push28=, $pop27, $pop26 -; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28 -; NO-SIMD128-NEXT: i32.const $push34=, 12 -; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34 -; NO-SIMD128-NEXT: i32.extend8_s $push32=, $13 -; NO-SIMD128-NEXT: i32.const $push78=, 255 -; NO-SIMD128-NEXT: i32.and $push31=, $29, $pop78 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop18 +; NO-SIMD128-NEXT: i32.extend8_s $push20=, $10 +; NO-SIMD128-NEXT: i32.const $push58=, 255 +; NO-SIMD128-NEXT: i32.and $push19=, $26, $pop58 +; NO-SIMD128-NEXT: i32.shr_s $push21=, $pop20, $pop19 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop21 +; NO-SIMD128-NEXT: i32.extend8_s $push23=, $9 +; NO-SIMD128-NEXT: i32.const $push57=, 255 +; NO-SIMD128-NEXT: i32.and $push22=, $25, $pop57 +; NO-SIMD128-NEXT: i32.shr_s $push24=, $pop23, $pop22 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop24 +; NO-SIMD128-NEXT: i32.extend8_s $push26=, $8 +; NO-SIMD128-NEXT: i32.const $push56=, 255 +; NO-SIMD128-NEXT: i32.and $push25=, $24, $pop56 +; NO-SIMD128-NEXT: i32.shr_s $push27=, $pop26, $pop25 +; NO-SIMD128-NEXT: 
i32.store8 7($0), $pop27 +; NO-SIMD128-NEXT: i32.extend8_s $push29=, $7 +; NO-SIMD128-NEXT: i32.const $push55=, 255 +; NO-SIMD128-NEXT: i32.and $push28=, $23, $pop55 +; NO-SIMD128-NEXT: i32.shr_s $push30=, $pop29, $pop28 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop30 +; NO-SIMD128-NEXT: i32.extend8_s $push32=, $6 +; NO-SIMD128-NEXT: i32.const $push54=, 255 +; NO-SIMD128-NEXT: i32.and $push31=, $22, $pop54 ; NO-SIMD128-NEXT: i32.shr_s $push33=, $pop32, $pop31 -; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33 -; NO-SIMD128-NEXT: i32.const $push39=, 11 -; NO-SIMD128-NEXT: i32.add $push40=, $0, $pop39 -; NO-SIMD128-NEXT: i32.extend8_s $push37=, $12 -; NO-SIMD128-NEXT: i32.const $push77=, 255 -; NO-SIMD128-NEXT: i32.and $push36=, $28, $pop77 -; NO-SIMD128-NEXT: i32.shr_s $push38=, $pop37, $pop36 -; NO-SIMD128-NEXT: i32.store8 0($pop40), $pop38 -; NO-SIMD128-NEXT: i32.const $push44=, 10 -; NO-SIMD128-NEXT: i32.add $push45=, $0, $pop44 -; NO-SIMD128-NEXT: i32.extend8_s $push42=, $11 -; NO-SIMD128-NEXT: i32.const $push76=, 255 -; NO-SIMD128-NEXT: i32.and $push41=, $27, $pop76 -; NO-SIMD128-NEXT: i32.shr_s $push43=, $pop42, $pop41 -; NO-SIMD128-NEXT: i32.store8 0($pop45), $pop43 -; NO-SIMD128-NEXT: i32.const $push49=, 9 -; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49 -; NO-SIMD128-NEXT: i32.extend8_s $push47=, $10 -; NO-SIMD128-NEXT: i32.const $push75=, 255 -; NO-SIMD128-NEXT: i32.and $push46=, $26, $pop75 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop33 +; NO-SIMD128-NEXT: i32.extend8_s $push35=, $5 +; NO-SIMD128-NEXT: i32.const $push53=, 255 +; NO-SIMD128-NEXT: i32.and $push34=, $21, $pop53 +; NO-SIMD128-NEXT: i32.shr_s $push36=, $pop35, $pop34 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop36 +; NO-SIMD128-NEXT: i32.extend8_s $push38=, $4 +; NO-SIMD128-NEXT: i32.const $push52=, 255 +; NO-SIMD128-NEXT: i32.and $push37=, $20, $pop52 +; NO-SIMD128-NEXT: i32.shr_s $push39=, $pop38, $pop37 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop39 +; NO-SIMD128-NEXT: i32.extend8_s $push41=, $3 +; 
NO-SIMD128-NEXT: i32.const $push51=, 255 +; NO-SIMD128-NEXT: i32.and $push40=, $19, $pop51 +; NO-SIMD128-NEXT: i32.shr_s $push42=, $pop41, $pop40 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop42 +; NO-SIMD128-NEXT: i32.extend8_s $push44=, $2 +; NO-SIMD128-NEXT: i32.const $push50=, 255 +; NO-SIMD128-NEXT: i32.and $push43=, $18, $pop50 +; NO-SIMD128-NEXT: i32.shr_s $push45=, $pop44, $pop43 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop45 +; NO-SIMD128-NEXT: i32.extend8_s $push47=, $1 +; NO-SIMD128-NEXT: i32.const $push49=, 255 +; NO-SIMD128-NEXT: i32.and $push46=, $17, $pop49 ; NO-SIMD128-NEXT: i32.shr_s $push48=, $pop47, $pop46 -; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48 -; NO-SIMD128-NEXT: i32.const $push54=, 7 -; NO-SIMD128-NEXT: i32.add $push55=, $0, $pop54 -; NO-SIMD128-NEXT: i32.extend8_s $push52=, $8 -; NO-SIMD128-NEXT: i32.const $push74=, 255 -; NO-SIMD128-NEXT: i32.and $push51=, $24, $pop74 -; NO-SIMD128-NEXT: i32.shr_s $push53=, $pop52, $pop51 -; NO-SIMD128-NEXT: i32.store8 0($pop55), $pop53 -; NO-SIMD128-NEXT: i32.const $push59=, 6 -; NO-SIMD128-NEXT: i32.add $push60=, $0, $pop59 -; NO-SIMD128-NEXT: i32.extend8_s $push57=, $7 -; NO-SIMD128-NEXT: i32.const $push73=, 255 -; NO-SIMD128-NEXT: i32.and $push56=, $23, $pop73 -; NO-SIMD128-NEXT: i32.shr_s $push58=, $pop57, $pop56 -; NO-SIMD128-NEXT: i32.store8 0($pop60), $pop58 -; NO-SIMD128-NEXT: i32.const $push64=, 5 -; NO-SIMD128-NEXT: i32.add $push65=, $0, $pop64 -; NO-SIMD128-NEXT: i32.extend8_s $push62=, $6 -; NO-SIMD128-NEXT: i32.const $push72=, 255 -; NO-SIMD128-NEXT: i32.and $push61=, $22, $pop72 -; NO-SIMD128-NEXT: i32.shr_s $push63=, $pop62, $pop61 -; NO-SIMD128-NEXT: i32.store8 0($pop65), $pop63 -; NO-SIMD128-NEXT: i32.const $push69=, 3 -; NO-SIMD128-NEXT: i32.add $push70=, $0, $pop69 -; NO-SIMD128-NEXT: i32.extend8_s $push67=, $4 -; NO-SIMD128-NEXT: i32.const $push71=, 255 -; NO-SIMD128-NEXT: i32.and $push66=, $20, $pop71 -; NO-SIMD128-NEXT: i32.shr_s $push68=, $pop67, $pop66 -; NO-SIMD128-NEXT: 
i32.store8 0($pop70), $pop68 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop48 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shr_s_vec_v16i8: @@ -3924,102 +3242,80 @@ define <16 x i8> @shr_s_vec_v16i8(<16 x i8> %v, <16 x i8> %x) { ; NO-SIMD128-FAST-NEXT: i32.shr_s $push3=, $pop2, $pop1 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop3 ; NO-SIMD128-FAST-NEXT: i32.extend8_s $push5=, $2 -; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop85 +; NO-SIMD128-FAST-NEXT: i32.const $push63=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop63 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push6=, $pop5, $pop4 ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6 ; NO-SIMD128-FAST-NEXT: i32.extend8_s $push8=, $3 -; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop84 +; NO-SIMD128-FAST-NEXT: i32.const $push62=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop62 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push9=, $pop8, $pop7 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push10=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $4 -; NO-SIMD128-FAST-NEXT: i32.const $push83=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push12=, $20, $pop83 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push14=, $pop13, $pop12 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop14 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $5 -; NO-SIMD128-FAST-NEXT: i32.const $push82=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push15=, $21, $pop82 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push17=, $pop16, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push18=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $6 -; NO-SIMD128-FAST-NEXT: i32.const $push81=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push20=, $22, $pop81 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push22=, 
$pop21, $pop20 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop19), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push23=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push26=, $7 -; NO-SIMD128-FAST-NEXT: i32.const $push80=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push25=, $23, $pop80 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $4 +; NO-SIMD128-FAST-NEXT: i32.const $push61=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop61 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $pop11, $pop10 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push14=, $5 +; NO-SIMD128-FAST-NEXT: i32.const $push60=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $21, $pop60 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push15=, $pop14, $pop13 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop15 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push17=, $6 +; NO-SIMD128-FAST-NEXT: i32.const $push59=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push16=, $22, $pop59 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push18=, $pop17, $pop16 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop18 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $7 +; NO-SIMD128-FAST-NEXT: i32.const $push58=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $23, $pop58 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push21=, $pop20, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop21 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $8 +; NO-SIMD128-FAST-NEXT: i32.const $push57=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $24, $pop57 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push24=, $pop23, $pop22 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push26=, $9 +; NO-SIMD128-FAST-NEXT: i32.const $push56=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push25=, $25, $pop56 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push27=, $pop26, $pop25 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop27 -; NO-SIMD128-FAST-NEXT: i32.const $push28=, 7 -; NO-SIMD128-FAST-NEXT: 
i32.add $push29=, $0, $pop28 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push31=, $8 -; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push30=, $24, $pop79 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push32=, $pop31, $pop30 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop29), $pop32 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push34=, $9 -; NO-SIMD128-FAST-NEXT: i32.const $push78=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push33=, $25, $pop78 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push35=, $pop34, $pop33 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop35 -; NO-SIMD128-FAST-NEXT: i32.const $push36=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push39=, $10 -; NO-SIMD128-FAST-NEXT: i32.const $push77=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push38=, $26, $pop77 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push40=, $pop39, $pop38 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop40 -; NO-SIMD128-FAST-NEXT: i32.const $push41=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push42=, $0, $pop41 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $11 -; NO-SIMD128-FAST-NEXT: i32.const $push76=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push43=, $27, $pop76 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop27 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $10 +; NO-SIMD128-FAST-NEXT: i32.const $push55=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push28=, $26, $pop55 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push30=, $pop29, $pop28 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop30 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push32=, $11 +; NO-SIMD128-FAST-NEXT: i32.const $push54=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push31=, $27, $pop54 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push33=, $pop32, $pop31 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop33 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push35=, $12 +; NO-SIMD128-FAST-NEXT: i32.const $push53=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push34=, $28, $pop53 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push36=, $pop35, $pop34 
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop36 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push38=, $13 +; NO-SIMD128-FAST-NEXT: i32.const $push52=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push37=, $29, $pop52 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push39=, $pop38, $pop37 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop39 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $14 +; NO-SIMD128-FAST-NEXT: i32.const $push51=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push40=, $30, $pop51 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push42=, $pop41, $pop40 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop42 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $15 +; NO-SIMD128-FAST-NEXT: i32.const $push50=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push43=, $31, $pop50 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push45=, $pop44, $pop43 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop42), $pop45 -; NO-SIMD128-FAST-NEXT: i32.const $push46=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push47=, $0, $pop46 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push49=, $12 -; NO-SIMD128-FAST-NEXT: i32.const $push75=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push48=, $28, $pop75 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push50=, $pop49, $pop48 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop47), $pop50 -; NO-SIMD128-FAST-NEXT: i32.const $push51=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push52=, $0, $pop51 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push54=, $13 -; NO-SIMD128-FAST-NEXT: i32.const $push74=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push53=, $29, $pop74 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push55=, $pop54, $pop53 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop52), $pop55 -; NO-SIMD128-FAST-NEXT: i32.const $push56=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push57=, $0, $pop56 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push59=, $14 -; NO-SIMD128-FAST-NEXT: i32.const $push73=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push58=, $30, $pop73 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push60=, $pop59, $pop58 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop57), $pop60 -; 
NO-SIMD128-FAST-NEXT: i32.const $push61=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push64=, $15 -; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push63=, $31, $pop72 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push65=, $pop64, $pop63 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop65 -; NO-SIMD128-FAST-NEXT: i32.const $push66=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push67=, $0, $pop66 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push69=, $16 -; NO-SIMD128-FAST-NEXT: i32.const $push71=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push68=, $32, $pop71 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push70=, $pop69, $pop68 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop67), $pop70 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop45 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push47=, $16 +; NO-SIMD128-FAST-NEXT: i32.const $push49=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push46=, $32, $pop49 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push48=, $pop47, $pop46 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop48 ; NO-SIMD128-FAST-NEXT: return %a = ashr <16 x i8> %v, %x ret <16 x i8> %a @@ -4042,94 +3338,72 @@ define <16 x i8> @shr_u_v16i8(<16 x i8> %v, i8 %x) { ; NO-SIMD128: .functype shr_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 255 -; NO-SIMD128-NEXT: i32.and $push1=, $9, $pop0 -; NO-SIMD128-NEXT: i32.const $push72=, 255 -; NO-SIMD128-NEXT: i32.and $push71=, $17, $pop72 -; NO-SIMD128-NEXT: local.tee $push70=, $17=, $pop71 -; NO-SIMD128-NEXT: i32.shr_u $push2=, $pop1, $pop70 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push69=, 255 -; NO-SIMD128-NEXT: i32.and $push3=, $5, $pop69 +; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop0 +; NO-SIMD128-NEXT: i32.const $push50=, 255 +; NO-SIMD128-NEXT: i32.and $push49=, $17, $pop50 +; NO-SIMD128-NEXT: local.tee $push48=, $17=, $pop49 +; 
NO-SIMD128-NEXT: i32.shr_u $push2=, $pop1, $pop48 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop2 +; NO-SIMD128-NEXT: i32.const $push47=, 255 +; NO-SIMD128-NEXT: i32.and $push3=, $15, $pop47 ; NO-SIMD128-NEXT: i32.shr_u $push4=, $pop3, $17 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push68=, 255 -; NO-SIMD128-NEXT: i32.and $push5=, $3, $pop68 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop4 +; NO-SIMD128-NEXT: i32.const $push46=, 255 +; NO-SIMD128-NEXT: i32.and $push5=, $14, $pop46 ; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $17 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop6 -; NO-SIMD128-NEXT: i32.const $push67=, 255 -; NO-SIMD128-NEXT: i32.and $push7=, $2, $pop67 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push45=, 255 +; NO-SIMD128-NEXT: i32.and $push7=, $13, $pop45 ; NO-SIMD128-NEXT: i32.shr_u $push8=, $pop7, $17 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop8 -; NO-SIMD128-NEXT: i32.const $push66=, 255 -; NO-SIMD128-NEXT: i32.and $push9=, $1, $pop66 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop8 +; NO-SIMD128-NEXT: i32.const $push44=, 255 +; NO-SIMD128-NEXT: i32.and $push9=, $12, $pop44 ; NO-SIMD128-NEXT: i32.shr_u $push10=, $pop9, $17 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop10 -; NO-SIMD128-NEXT: i32.const $push13=, 15 -; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-NEXT: i32.const $push65=, 255 -; NO-SIMD128-NEXT: i32.and $push11=, $16, $pop65 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop10 +; NO-SIMD128-NEXT: i32.const $push43=, 255 +; NO-SIMD128-NEXT: i32.and $push11=, $11, $pop43 ; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12 -; NO-SIMD128-NEXT: i32.const $push17=, 14 -; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-NEXT: i32.const $push64=, 255 -; NO-SIMD128-NEXT: i32.and $push15=, $15, $pop64 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop12 +; NO-SIMD128-NEXT: i32.const $push42=, 255 +; NO-SIMD128-NEXT: i32.and $push13=, $10, 
$pop42 +; NO-SIMD128-NEXT: i32.shr_u $push14=, $pop13, $17 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop14 +; NO-SIMD128-NEXT: i32.const $push41=, 255 +; NO-SIMD128-NEXT: i32.and $push15=, $9, $pop41 ; NO-SIMD128-NEXT: i32.shr_u $push16=, $pop15, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16 -; NO-SIMD128-NEXT: i32.const $push21=, 13 -; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-NEXT: i32.const $push63=, 255 -; NO-SIMD128-NEXT: i32.and $push19=, $14, $pop63 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop16 +; NO-SIMD128-NEXT: i32.const $push40=, 255 +; NO-SIMD128-NEXT: i32.and $push17=, $8, $pop40 +; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $17 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop18 +; NO-SIMD128-NEXT: i32.const $push39=, 255 +; NO-SIMD128-NEXT: i32.and $push19=, $7, $pop39 ; NO-SIMD128-NEXT: i32.shr_u $push20=, $pop19, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20 -; NO-SIMD128-NEXT: i32.const $push25=, 12 -; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25 -; NO-SIMD128-NEXT: i32.const $push62=, 255 -; NO-SIMD128-NEXT: i32.and $push23=, $13, $pop62 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop20 +; NO-SIMD128-NEXT: i32.const $push38=, 255 +; NO-SIMD128-NEXT: i32.and $push21=, $6, $pop38 +; NO-SIMD128-NEXT: i32.shr_u $push22=, $pop21, $17 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop22 +; NO-SIMD128-NEXT: i32.const $push37=, 255 +; NO-SIMD128-NEXT: i32.and $push23=, $5, $pop37 ; NO-SIMD128-NEXT: i32.shr_u $push24=, $pop23, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24 -; NO-SIMD128-NEXT: i32.const $push29=, 11 -; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-NEXT: i32.const $push61=, 255 -; NO-SIMD128-NEXT: i32.and $push27=, $12, $pop61 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop24 +; NO-SIMD128-NEXT: i32.const $push36=, 255 +; NO-SIMD128-NEXT: i32.and $push25=, $4, $pop36 +; NO-SIMD128-NEXT: i32.shr_u $push26=, $pop25, $17 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop26 +; NO-SIMD128-NEXT: i32.const $push35=, 255 +; 
NO-SIMD128-NEXT: i32.and $push27=, $3, $pop35 ; NO-SIMD128-NEXT: i32.shr_u $push28=, $pop27, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28 -; NO-SIMD128-NEXT: i32.const $push33=, 10 -; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-NEXT: i32.const $push60=, 255 -; NO-SIMD128-NEXT: i32.and $push31=, $11, $pop60 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop28 +; NO-SIMD128-NEXT: i32.const $push34=, 255 +; NO-SIMD128-NEXT: i32.and $push29=, $2, $pop34 +; NO-SIMD128-NEXT: i32.shr_u $push30=, $pop29, $17 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop30 +; NO-SIMD128-NEXT: i32.const $push33=, 255 +; NO-SIMD128-NEXT: i32.and $push31=, $1, $pop33 ; NO-SIMD128-NEXT: i32.shr_u $push32=, $pop31, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32 -; NO-SIMD128-NEXT: i32.const $push37=, 9 -; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37 -; NO-SIMD128-NEXT: i32.const $push59=, 255 -; NO-SIMD128-NEXT: i32.and $push35=, $10, $pop59 -; NO-SIMD128-NEXT: i32.shr_u $push36=, $pop35, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36 -; NO-SIMD128-NEXT: i32.const $push41=, 7 -; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41 -; NO-SIMD128-NEXT: i32.const $push58=, 255 -; NO-SIMD128-NEXT: i32.and $push39=, $8, $pop58 -; NO-SIMD128-NEXT: i32.shr_u $push40=, $pop39, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40 -; NO-SIMD128-NEXT: i32.const $push45=, 6 -; NO-SIMD128-NEXT: i32.add $push46=, $0, $pop45 -; NO-SIMD128-NEXT: i32.const $push57=, 255 -; NO-SIMD128-NEXT: i32.and $push43=, $7, $pop57 -; NO-SIMD128-NEXT: i32.shr_u $push44=, $pop43, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop46), $pop44 -; NO-SIMD128-NEXT: i32.const $push49=, 5 -; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49 -; NO-SIMD128-NEXT: i32.const $push56=, 255 -; NO-SIMD128-NEXT: i32.and $push47=, $6, $pop56 -; NO-SIMD128-NEXT: i32.shr_u $push48=, $pop47, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48 -; NO-SIMD128-NEXT: i32.const $push53=, 3 -; NO-SIMD128-NEXT: i32.add $push54=, $0, $pop53 -; 
NO-SIMD128-NEXT: i32.const $push55=, 255 -; NO-SIMD128-NEXT: i32.and $push51=, $4, $pop55 -; NO-SIMD128-NEXT: i32.shr_u $push52=, $pop51, $17 -; NO-SIMD128-NEXT: i32.store8 0($pop54), $pop52 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop32 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shr_u_v16i8: @@ -4137,93 +3411,71 @@ define <16 x i8> @shr_u_v16i8(<16 x i8> %v, i8 %x) { ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255 ; NO-SIMD128-FAST-NEXT: i32.and $push1=, $1, $pop0 -; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push71=, $17, $pop72 -; NO-SIMD128-FAST-NEXT: local.tee $push70=, $1=, $pop71 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $pop1, $pop70 +; NO-SIMD128-FAST-NEXT: i32.const $push50=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push49=, $17, $pop50 +; NO-SIMD128-FAST-NEXT: local.tee $push48=, $1=, $pop49 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $pop1, $pop48 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push3=, $2, $pop69 +; NO-SIMD128-FAST-NEXT: i32.const $push47=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push3=, $2, $pop47 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push4=, $pop3, $1 ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $3, $pop68 +; NO-SIMD128-FAST-NEXT: i32.const $push46=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $3, $pop46 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $1 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push7=, $4, $pop67 +; NO-SIMD128-FAST-NEXT: i32.const $push45=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $4, $pop45 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push8=, $pop7, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 
0($pop10), $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push11=, $5, $pop66 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.const $push44=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push9=, $5, $pop44 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push10=, $pop9, $1 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.const $push43=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push11=, $6, $pop43 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push15=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push16=, $0, $pop15 -; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push13=, $6, $pop65 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push42=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $7, $pop42 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push14=, $pop13, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop16), $pop14 -; NO-SIMD128-FAST-NEXT: i32.const $push19=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push20=, $0, $pop19 -; NO-SIMD128-FAST-NEXT: i32.const $push64=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push17=, $7, $pop64 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.const $push41=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push15=, $8, $pop41 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push16=, $pop15, $1 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.const $push40=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $9, $pop40 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push18=, $pop17, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop20), $pop18 -; NO-SIMD128-FAST-NEXT: i32.const $push23=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-FAST-NEXT: i32.const $push63=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push21=, $8, $pop63 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop18 +; 
NO-SIMD128-FAST-NEXT: i32.const $push39=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $10, $pop39 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push20=, $pop19, $1 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.const $push38=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push21=, $11, $pop38 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push22=, $pop21, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push62=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push25=, $9, $pop62 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop22 +; NO-SIMD128-FAST-NEXT: i32.const $push37=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push23=, $12, $pop37 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push24=, $pop23, $1 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.const $push36=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push25=, $13, $pop36 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push26=, $pop25, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push29=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-FAST-NEXT: i32.const $push61=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push27=, $10, $pop61 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop26 +; NO-SIMD128-FAST-NEXT: i32.const $push35=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push27=, $14, $pop35 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push28=, $pop27, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop28 -; NO-SIMD128-FAST-NEXT: i32.const $push33=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-FAST-NEXT: i32.const $push60=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push31=, $11, $pop60 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop28 +; NO-SIMD128-FAST-NEXT: i32.const $push34=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push29=, $15, $pop34 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push30=, $pop29, $1 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop30 +; NO-SIMD128-FAST-NEXT: i32.const $push33=, 255 +; 
NO-SIMD128-FAST-NEXT: i32.and $push31=, $16, $pop33 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push32=, $pop31, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32 -; NO-SIMD128-FAST-NEXT: i32.const $push37=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push38=, $0, $pop37 -; NO-SIMD128-FAST-NEXT: i32.const $push59=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push35=, $12, $pop59 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push36=, $pop35, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop38), $pop36 -; NO-SIMD128-FAST-NEXT: i32.const $push41=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push42=, $0, $pop41 -; NO-SIMD128-FAST-NEXT: i32.const $push58=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push39=, $13, $pop58 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push40=, $pop39, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop42), $pop40 -; NO-SIMD128-FAST-NEXT: i32.const $push45=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push46=, $0, $pop45 -; NO-SIMD128-FAST-NEXT: i32.const $push57=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push43=, $14, $pop57 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push44=, $pop43, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop46), $pop44 -; NO-SIMD128-FAST-NEXT: i32.const $push49=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49 -; NO-SIMD128-FAST-NEXT: i32.const $push56=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push47=, $15, $pop56 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push48=, $pop47, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48 -; NO-SIMD128-FAST-NEXT: i32.const $push53=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push54=, $0, $pop53 -; NO-SIMD128-FAST-NEXT: i32.const $push55=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push51=, $16, $pop55 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push52=, $pop51, $1 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop54), $pop52 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop32 ; NO-SIMD128-FAST-NEXT: return %t = insertelement <16 x i8> undef, i8 %x, i32 0 %s = shufflevector <16 x i8> %t, <16 x i8> undef, @@ -4440,123 +3692,101 @@ define <16 x i8> @shr_u_vec_v16i8(<16 x i8> %v, <16 
x i8> %x) { ; NO-SIMD128: .functype shr_u_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 255 -; NO-SIMD128-NEXT: i32.and $push2=, $9, $pop0 -; NO-SIMD128-NEXT: i32.const $push101=, 255 -; NO-SIMD128-NEXT: i32.and $push1=, $25, $pop101 -; NO-SIMD128-NEXT: i32.shr_u $push3=, $pop2, $pop1 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push100=, 255 -; NO-SIMD128-NEXT: i32.and $push5=, $5, $pop100 -; NO-SIMD128-NEXT: i32.const $push99=, 255 -; NO-SIMD128-NEXT: i32.and $push4=, $21, $pop99 -; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop4 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop6 -; NO-SIMD128-NEXT: i32.const $push98=, 255 -; NO-SIMD128-NEXT: i32.and $push8=, $3, $pop98 -; NO-SIMD128-NEXT: i32.const $push97=, 255 -; NO-SIMD128-NEXT: i32.and $push7=, $19, $pop97 -; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop7 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop9 -; NO-SIMD128-NEXT: i32.const $push96=, 255 -; NO-SIMD128-NEXT: i32.and $push11=, $2, $pop96 -; NO-SIMD128-NEXT: i32.const $push95=, 255 -; NO-SIMD128-NEXT: i32.and $push10=, $18, $pop95 -; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $pop10 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop12 -; NO-SIMD128-NEXT: i32.const $push94=, 255 -; NO-SIMD128-NEXT: i32.and $push14=, $1, $pop94 -; NO-SIMD128-NEXT: i32.const $push93=, 255 -; NO-SIMD128-NEXT: i32.and $push13=, $17, $pop93 -; NO-SIMD128-NEXT: i32.shr_u $push15=, $pop14, $pop13 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop15 -; NO-SIMD128-NEXT: i32.const $push19=, 15 -; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19 -; NO-SIMD128-NEXT: i32.const $push92=, 255 -; NO-SIMD128-NEXT: i32.and $push17=, $16, $pop92 -; NO-SIMD128-NEXT: i32.const $push91=, 255 -; NO-SIMD128-NEXT: i32.and $push16=, $32, $pop91 -; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, 
$pop16 -; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18 -; NO-SIMD128-NEXT: i32.const $push24=, 14 -; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-NEXT: i32.const $push90=, 255 -; NO-SIMD128-NEXT: i32.and $push22=, $15, $pop90 -; NO-SIMD128-NEXT: i32.const $push89=, 255 -; NO-SIMD128-NEXT: i32.and $push21=, $31, $pop89 -; NO-SIMD128-NEXT: i32.shr_u $push23=, $pop22, $pop21 -; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23 -; NO-SIMD128-NEXT: i32.const $push29=, 13 -; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-NEXT: i32.const $push88=, 255 -; NO-SIMD128-NEXT: i32.and $push27=, $14, $pop88 -; NO-SIMD128-NEXT: i32.const $push87=, 255 -; NO-SIMD128-NEXT: i32.and $push26=, $30, $pop87 -; NO-SIMD128-NEXT: i32.shr_u $push28=, $pop27, $pop26 -; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28 -; NO-SIMD128-NEXT: i32.const $push34=, 12 -; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34 -; NO-SIMD128-NEXT: i32.const $push86=, 255 -; NO-SIMD128-NEXT: i32.and $push32=, $13, $pop86 -; NO-SIMD128-NEXT: i32.const $push85=, 255 -; NO-SIMD128-NEXT: i32.and $push31=, $29, $pop85 -; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop31 -; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33 -; NO-SIMD128-NEXT: i32.const $push39=, 11 -; NO-SIMD128-NEXT: i32.add $push40=, $0, $pop39 -; NO-SIMD128-NEXT: i32.const $push84=, 255 -; NO-SIMD128-NEXT: i32.and $push37=, $12, $pop84 -; NO-SIMD128-NEXT: i32.const $push83=, 255 -; NO-SIMD128-NEXT: i32.and $push36=, $28, $pop83 -; NO-SIMD128-NEXT: i32.shr_u $push38=, $pop37, $pop36 -; NO-SIMD128-NEXT: i32.store8 0($pop40), $pop38 -; NO-SIMD128-NEXT: i32.const $push44=, 10 -; NO-SIMD128-NEXT: i32.add $push45=, $0, $pop44 -; NO-SIMD128-NEXT: i32.const $push82=, 255 -; NO-SIMD128-NEXT: i32.and $push42=, $11, $pop82 -; NO-SIMD128-NEXT: i32.const $push81=, 255 -; NO-SIMD128-NEXT: i32.and $push41=, $27, $pop81 -; NO-SIMD128-NEXT: i32.shr_u $push43=, $pop42, $pop41 -; NO-SIMD128-NEXT: i32.store8 0($pop45), $pop43 -; NO-SIMD128-NEXT: 
i32.const $push49=, 9 -; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49 -; NO-SIMD128-NEXT: i32.const $push80=, 255 -; NO-SIMD128-NEXT: i32.and $push47=, $10, $pop80 +; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0 ; NO-SIMD128-NEXT: i32.const $push79=, 255 -; NO-SIMD128-NEXT: i32.and $push46=, $26, $pop79 -; NO-SIMD128-NEXT: i32.shr_u $push48=, $pop47, $pop46 -; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48 -; NO-SIMD128-NEXT: i32.const $push54=, 7 -; NO-SIMD128-NEXT: i32.add $push55=, $0, $pop54 +; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop79 +; NO-SIMD128-NEXT: i32.shr_u $push3=, $pop2, $pop1 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop3 ; NO-SIMD128-NEXT: i32.const $push78=, 255 -; NO-SIMD128-NEXT: i32.and $push52=, $8, $pop78 +; NO-SIMD128-NEXT: i32.and $push5=, $15, $pop78 ; NO-SIMD128-NEXT: i32.const $push77=, 255 -; NO-SIMD128-NEXT: i32.and $push51=, $24, $pop77 -; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop51 -; NO-SIMD128-NEXT: i32.store8 0($pop55), $pop53 -; NO-SIMD128-NEXT: i32.const $push59=, 6 -; NO-SIMD128-NEXT: i32.add $push60=, $0, $pop59 +; NO-SIMD128-NEXT: i32.and $push4=, $31, $pop77 +; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop4 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop6 ; NO-SIMD128-NEXT: i32.const $push76=, 255 -; NO-SIMD128-NEXT: i32.and $push57=, $7, $pop76 +; NO-SIMD128-NEXT: i32.and $push8=, $14, $pop76 ; NO-SIMD128-NEXT: i32.const $push75=, 255 -; NO-SIMD128-NEXT: i32.and $push56=, $23, $pop75 -; NO-SIMD128-NEXT: i32.shr_u $push58=, $pop57, $pop56 -; NO-SIMD128-NEXT: i32.store8 0($pop60), $pop58 -; NO-SIMD128-NEXT: i32.const $push64=, 5 -; NO-SIMD128-NEXT: i32.add $push65=, $0, $pop64 +; NO-SIMD128-NEXT: i32.and $push7=, $30, $pop75 +; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop7 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop9 ; NO-SIMD128-NEXT: i32.const $push74=, 255 -; NO-SIMD128-NEXT: i32.and $push62=, $6, $pop74 +; NO-SIMD128-NEXT: i32.and $push11=, $13, $pop74 ; NO-SIMD128-NEXT: i32.const $push73=, 255 -; 
NO-SIMD128-NEXT: i32.and $push61=, $22, $pop73 -; NO-SIMD128-NEXT: i32.shr_u $push63=, $pop62, $pop61 -; NO-SIMD128-NEXT: i32.store8 0($pop65), $pop63 -; NO-SIMD128-NEXT: i32.const $push69=, 3 -; NO-SIMD128-NEXT: i32.add $push70=, $0, $pop69 +; NO-SIMD128-NEXT: i32.and $push10=, $29, $pop73 +; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $pop10 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop12 ; NO-SIMD128-NEXT: i32.const $push72=, 255 -; NO-SIMD128-NEXT: i32.and $push67=, $4, $pop72 +; NO-SIMD128-NEXT: i32.and $push14=, $12, $pop72 ; NO-SIMD128-NEXT: i32.const $push71=, 255 -; NO-SIMD128-NEXT: i32.and $push66=, $20, $pop71 -; NO-SIMD128-NEXT: i32.shr_u $push68=, $pop67, $pop66 -; NO-SIMD128-NEXT: i32.store8 0($pop70), $pop68 +; NO-SIMD128-NEXT: i32.and $push13=, $28, $pop71 +; NO-SIMD128-NEXT: i32.shr_u $push15=, $pop14, $pop13 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop15 +; NO-SIMD128-NEXT: i32.const $push70=, 255 +; NO-SIMD128-NEXT: i32.and $push17=, $11, $pop70 +; NO-SIMD128-NEXT: i32.const $push69=, 255 +; NO-SIMD128-NEXT: i32.and $push16=, $27, $pop69 +; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $pop16 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop18 +; NO-SIMD128-NEXT: i32.const $push68=, 255 +; NO-SIMD128-NEXT: i32.and $push20=, $10, $pop68 +; NO-SIMD128-NEXT: i32.const $push67=, 255 +; NO-SIMD128-NEXT: i32.and $push19=, $26, $pop67 +; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop19 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop21 +; NO-SIMD128-NEXT: i32.const $push66=, 255 +; NO-SIMD128-NEXT: i32.and $push23=, $9, $pop66 +; NO-SIMD128-NEXT: i32.const $push65=, 255 +; NO-SIMD128-NEXT: i32.and $push22=, $25, $pop65 +; NO-SIMD128-NEXT: i32.shr_u $push24=, $pop23, $pop22 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop24 +; NO-SIMD128-NEXT: i32.const $push64=, 255 +; NO-SIMD128-NEXT: i32.and $push26=, $8, $pop64 +; NO-SIMD128-NEXT: i32.const $push63=, 255 +; NO-SIMD128-NEXT: i32.and $push25=, $24, $pop63 +; NO-SIMD128-NEXT: i32.shr_u $push27=, $pop26, $pop25 +; 
NO-SIMD128-NEXT: i32.store8 7($0), $pop27 +; NO-SIMD128-NEXT: i32.const $push62=, 255 +; NO-SIMD128-NEXT: i32.and $push29=, $7, $pop62 +; NO-SIMD128-NEXT: i32.const $push61=, 255 +; NO-SIMD128-NEXT: i32.and $push28=, $23, $pop61 +; NO-SIMD128-NEXT: i32.shr_u $push30=, $pop29, $pop28 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop30 +; NO-SIMD128-NEXT: i32.const $push60=, 255 +; NO-SIMD128-NEXT: i32.and $push32=, $6, $pop60 +; NO-SIMD128-NEXT: i32.const $push59=, 255 +; NO-SIMD128-NEXT: i32.and $push31=, $22, $pop59 +; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop31 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop33 +; NO-SIMD128-NEXT: i32.const $push58=, 255 +; NO-SIMD128-NEXT: i32.and $push35=, $5, $pop58 +; NO-SIMD128-NEXT: i32.const $push57=, 255 +; NO-SIMD128-NEXT: i32.and $push34=, $21, $pop57 +; NO-SIMD128-NEXT: i32.shr_u $push36=, $pop35, $pop34 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop36 +; NO-SIMD128-NEXT: i32.const $push56=, 255 +; NO-SIMD128-NEXT: i32.and $push38=, $4, $pop56 +; NO-SIMD128-NEXT: i32.const $push55=, 255 +; NO-SIMD128-NEXT: i32.and $push37=, $20, $pop55 +; NO-SIMD128-NEXT: i32.shr_u $push39=, $pop38, $pop37 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop39 +; NO-SIMD128-NEXT: i32.const $push54=, 255 +; NO-SIMD128-NEXT: i32.and $push41=, $3, $pop54 +; NO-SIMD128-NEXT: i32.const $push53=, 255 +; NO-SIMD128-NEXT: i32.and $push40=, $19, $pop53 +; NO-SIMD128-NEXT: i32.shr_u $push42=, $pop41, $pop40 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop42 +; NO-SIMD128-NEXT: i32.const $push52=, 255 +; NO-SIMD128-NEXT: i32.and $push44=, $2, $pop52 +; NO-SIMD128-NEXT: i32.const $push51=, 255 +; NO-SIMD128-NEXT: i32.and $push43=, $18, $pop51 +; NO-SIMD128-NEXT: i32.shr_u $push45=, $pop44, $pop43 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop45 +; NO-SIMD128-NEXT: i32.const $push50=, 255 +; NO-SIMD128-NEXT: i32.and $push47=, $1, $pop50 +; NO-SIMD128-NEXT: i32.const $push49=, 255 +; NO-SIMD128-NEXT: i32.and $push46=, $17, $pop49 +; NO-SIMD128-NEXT: i32.shr_u $push48=, $pop47, 
$pop46 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop48 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shr_u_vec_v16i8: @@ -4564,122 +3794,100 @@ define <16 x i8> @shr_u_vec_v16i8(<16 x i8> %v, <16 x i8> %x) { ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255 ; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0 -; NO-SIMD128-FAST-NEXT: i32.const $push101=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop101 +; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop79 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push3=, $pop2, $pop1 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push100=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop100 -; NO-SIMD128-FAST-NEXT: i32.const $push99=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop99 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop4 -; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push98=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop98 -; NO-SIMD128-FAST-NEXT: i32.const $push97=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop97 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop7 -; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push96=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop96 -; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop95 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $pop10 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push16=, $5, $pop94 -; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push15=, $21, $pop93 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop15 -; NO-SIMD128-FAST-NEXT: 
i32.store8 4($0), $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push19=, $6, $pop92 -; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push18=, $22, $pop91 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push20=, $pop19, $pop18 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push24=, $7, $pop90 -; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push23=, $23, $pop89 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop23 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push31=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31 -; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $pop88 -; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push28=, $24, $pop87 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push30=, $pop29, $pop28 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop32), $pop30 -; NO-SIMD128-FAST-NEXT: i32.const $push86=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push34=, $9, $pop86 -; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push33=, $25, $pop85 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop33 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop35 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39 -; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push37=, $10, $pop84 -; NO-SIMD128-FAST-NEXT: i32.const $push83=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push36=, $26, $pop83 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push38=, $pop37, $pop36 -; 
NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38 -; NO-SIMD128-FAST-NEXT: i32.const $push44=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push45=, $0, $pop44 -; NO-SIMD128-FAST-NEXT: i32.const $push82=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push42=, $11, $pop82 -; NO-SIMD128-FAST-NEXT: i32.const $push81=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push41=, $27, $pop81 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push43=, $pop42, $pop41 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop45), $pop43 -; NO-SIMD128-FAST-NEXT: i32.const $push49=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49 -; NO-SIMD128-FAST-NEXT: i32.const $push80=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push47=, $12, $pop80 -; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push46=, $28, $pop79 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push48=, $pop47, $pop46 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48 -; NO-SIMD128-FAST-NEXT: i32.const $push54=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push55=, $0, $pop54 ; NO-SIMD128-FAST-NEXT: i32.const $push78=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push52=, $13, $pop78 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop78 ; NO-SIMD128-FAST-NEXT: i32.const $push77=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push51=, $29, $pop77 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push53=, $pop52, $pop51 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop55), $pop53 -; NO-SIMD128-FAST-NEXT: i32.const $push59=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push60=, $0, $pop59 +; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop77 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop4 +; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6 ; NO-SIMD128-FAST-NEXT: i32.const $push76=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push57=, $14, $pop76 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop76 ; NO-SIMD128-FAST-NEXT: i32.const $push75=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push56=, $30, $pop75 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push58=, $pop57, $pop56 -; NO-SIMD128-FAST-NEXT: i32.store8 
0($pop60), $pop58 -; NO-SIMD128-FAST-NEXT: i32.const $push64=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push65=, $0, $pop64 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop75 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop7 +; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop9 ; NO-SIMD128-FAST-NEXT: i32.const $push74=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push62=, $15, $pop74 +; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop74 ; NO-SIMD128-FAST-NEXT: i32.const $push73=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push61=, $31, $pop73 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push63=, $pop62, $pop61 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop65), $pop63 -; NO-SIMD128-FAST-NEXT: i32.const $push69=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push70=, $0, $pop69 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop73 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $pop10 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop12 ; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push67=, $16, $pop72 +; NO-SIMD128-FAST-NEXT: i32.and $push14=, $5, $pop72 ; NO-SIMD128-FAST-NEXT: i32.const $push71=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push66=, $32, $pop71 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push68=, $pop67, $pop66 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop70), $pop68 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $21, $pop71 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push15=, $pop14, $pop13 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop15 +; NO-SIMD128-FAST-NEXT: i32.const $push70=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $6, $pop70 +; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push16=, $22, $pop69 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push18=, $pop17, $pop16 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop18 +; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop68 +; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $23, $pop67 +; 
NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop21 +; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push23=, $8, $pop66 +; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $24, $pop65 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push24=, $pop23, $pop22 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.const $push64=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push26=, $9, $pop64 +; NO-SIMD128-FAST-NEXT: i32.const $push63=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push25=, $25, $pop63 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push27=, $pop26, $pop25 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop27 +; NO-SIMD128-FAST-NEXT: i32.const $push62=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push29=, $10, $pop62 +; NO-SIMD128-FAST-NEXT: i32.const $push61=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push28=, $26, $pop61 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push30=, $pop29, $pop28 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop30 +; NO-SIMD128-FAST-NEXT: i32.const $push60=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push32=, $11, $pop60 +; NO-SIMD128-FAST-NEXT: i32.const $push59=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push31=, $27, $pop59 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop31 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop33 +; NO-SIMD128-FAST-NEXT: i32.const $push58=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push35=, $12, $pop58 +; NO-SIMD128-FAST-NEXT: i32.const $push57=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push34=, $28, $pop57 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push36=, $pop35, $pop34 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop36 +; NO-SIMD128-FAST-NEXT: i32.const $push56=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push38=, $13, $pop56 +; NO-SIMD128-FAST-NEXT: i32.const $push55=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push37=, $29, $pop55 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push39=, $pop38, $pop37 +; NO-SIMD128-FAST-NEXT: 
i32.store8 12($0), $pop39 +; NO-SIMD128-FAST-NEXT: i32.const $push54=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push41=, $14, $pop54 +; NO-SIMD128-FAST-NEXT: i32.const $push53=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push40=, $30, $pop53 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push42=, $pop41, $pop40 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop42 +; NO-SIMD128-FAST-NEXT: i32.const $push52=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push44=, $15, $pop52 +; NO-SIMD128-FAST-NEXT: i32.const $push51=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push43=, $31, $pop51 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop43 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop45 +; NO-SIMD128-FAST-NEXT: i32.const $push50=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push47=, $16, $pop50 +; NO-SIMD128-FAST-NEXT: i32.const $push49=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push46=, $32, $pop49 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push48=, $pop47, $pop46 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop48 ; NO-SIMD128-FAST-NEXT: return %a = lshr <16 x i8> %v, %x ret <16 x i8> %a @@ -4701,60 +3909,38 @@ define <16 x i8> @and_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-LABEL: and_v16i8: ; NO-SIMD128: .functype and_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.and $push0=, $9, $25 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop0 -; NO-SIMD128-NEXT: i32.and $push1=, $5, $21 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop1 -; NO-SIMD128-NEXT: i32.and $push2=, $3, $19 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop2 -; NO-SIMD128-NEXT: i32.and $push3=, $2, $18 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop3 -; NO-SIMD128-NEXT: i32.and $push4=, $1, $17 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push6=, 15 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.and $push5=, $16, $32 -; 
NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5 -; NO-SIMD128-NEXT: i32.const $push9=, 14 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-NEXT: i32.and $push8=, $15, $31 -; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8 -; NO-SIMD128-NEXT: i32.const $push12=, 13 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.and $push11=, $14, $30 -; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11 -; NO-SIMD128-NEXT: i32.const $push15=, 12 -; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15 -; NO-SIMD128-NEXT: i32.and $push14=, $13, $29 -; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14 -; NO-SIMD128-NEXT: i32.const $push18=, 11 -; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-NEXT: i32.and $push17=, $12, $28 -; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17 -; NO-SIMD128-NEXT: i32.const $push21=, 10 -; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-NEXT: i32.and $push20=, $11, $27 -; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20 -; NO-SIMD128-NEXT: i32.const $push24=, 9 -; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-NEXT: i32.and $push23=, $10, $26 -; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23 -; NO-SIMD128-NEXT: i32.const $push27=, 7 -; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-NEXT: i32.and $push26=, $8, $24 -; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26 -; NO-SIMD128-NEXT: i32.const $push30=, 6 -; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30 -; NO-SIMD128-NEXT: i32.and $push29=, $7, $23 -; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29 -; NO-SIMD128-NEXT: i32.const $push33=, 5 -; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-NEXT: i32.and $push32=, $6, $22 -; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32 -; NO-SIMD128-NEXT: i32.const $push36=, 3 -; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36 -; NO-SIMD128-NEXT: i32.and $push35=, $4, $20 -; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35 +; NO-SIMD128-NEXT: i32.and $push0=, $16, $32 +; NO-SIMD128-NEXT: i32.store8 15($0), 
$pop0 +; NO-SIMD128-NEXT: i32.and $push1=, $15, $31 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop1 +; NO-SIMD128-NEXT: i32.and $push2=, $14, $30 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop2 +; NO-SIMD128-NEXT: i32.and $push3=, $13, $29 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop3 +; NO-SIMD128-NEXT: i32.and $push4=, $12, $28 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop4 +; NO-SIMD128-NEXT: i32.and $push5=, $11, $27 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop5 +; NO-SIMD128-NEXT: i32.and $push6=, $10, $26 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop6 +; NO-SIMD128-NEXT: i32.and $push7=, $9, $25 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop7 +; NO-SIMD128-NEXT: i32.and $push8=, $8, $24 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop8 +; NO-SIMD128-NEXT: i32.and $push9=, $7, $23 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop9 +; NO-SIMD128-NEXT: i32.and $push10=, $6, $22 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop10 +; NO-SIMD128-NEXT: i32.and $push11=, $5, $21 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop11 +; NO-SIMD128-NEXT: i32.and $push12=, $4, $20 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop12 +; NO-SIMD128-NEXT: i32.and $push13=, $3, $19 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop13 +; NO-SIMD128-NEXT: i32.and $push14=, $2, $18 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop14 +; NO-SIMD128-NEXT: i32.and $push15=, $1, $17 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop15 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: and_v16i8: @@ -4766,54 +3952,32 @@ define <16 x i8> @and_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.and $push2=, $3, $19 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $4, $20 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5 -; NO-SIMD128-FAST-NEXT: i32.and $push6=, $5, $21 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5 
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.and $push9=, $6, $22 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-FAST-NEXT: i32.and $push12=, $7, $23 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.and $push15=, $8, $24 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15 -; NO-SIMD128-FAST-NEXT: i32.and $push16=, $9, $25 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.and $push19=, $10, $26 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19 -; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20 -; NO-SIMD128-FAST-NEXT: i32.and $push22=, $11, $27 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-FAST-NEXT: i32.and $push25=, $12, $28 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-FAST-NEXT: i32.and $push28=, $13, $29 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28 -; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-FAST-NEXT: i32.and $push31=, $14, $30 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31 -; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32 -; NO-SIMD128-FAST-NEXT: i32.and $push34=, $15, $31 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34 -; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15 -; NO-SIMD128-FAST-NEXT: i32.add 
$push36=, $0, $pop35 -; NO-SIMD128-FAST-NEXT: i32.and $push37=, $16, $32 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37 +; NO-SIMD128-FAST-NEXT: i32.and $push3=, $4, $20 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3 +; NO-SIMD128-FAST-NEXT: i32.and $push4=, $5, $21 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $6, $22 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.and $push6=, $7, $23 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $8, $24 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $9, $25 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.and $push9=, $10, $26 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $11, $27 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.and $push11=, $12, $28 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.and $push12=, $13, $29 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $14, $30 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13 +; NO-SIMD128-FAST-NEXT: i32.and $push14=, $15, $31 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.and $push15=, $16, $32 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15 ; NO-SIMD128-FAST-NEXT: return %a = and <16 x i8> %x, %y ret <16 x i8> %a @@ -4835,60 +3999,38 @@ define <16 x i8> @or_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-LABEL: or_v16i8: ; NO-SIMD128: .functype or_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.or $push0=, $9, $25 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop0 -; NO-SIMD128-NEXT: i32.or $push1=, $5, $21 -; 
NO-SIMD128-NEXT: i32.store8 4($0), $pop1 -; NO-SIMD128-NEXT: i32.or $push2=, $3, $19 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop2 -; NO-SIMD128-NEXT: i32.or $push3=, $2, $18 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop3 -; NO-SIMD128-NEXT: i32.or $push4=, $1, $17 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push6=, 15 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.or $push5=, $16, $32 -; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5 -; NO-SIMD128-NEXT: i32.const $push9=, 14 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-NEXT: i32.or $push8=, $15, $31 -; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8 -; NO-SIMD128-NEXT: i32.const $push12=, 13 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.or $push11=, $14, $30 -; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11 -; NO-SIMD128-NEXT: i32.const $push15=, 12 -; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15 -; NO-SIMD128-NEXT: i32.or $push14=, $13, $29 -; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14 -; NO-SIMD128-NEXT: i32.const $push18=, 11 -; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-NEXT: i32.or $push17=, $12, $28 -; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17 -; NO-SIMD128-NEXT: i32.const $push21=, 10 -; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-NEXT: i32.or $push20=, $11, $27 -; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20 -; NO-SIMD128-NEXT: i32.const $push24=, 9 -; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-NEXT: i32.or $push23=, $10, $26 -; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23 -; NO-SIMD128-NEXT: i32.const $push27=, 7 -; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-NEXT: i32.or $push26=, $8, $24 -; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26 -; NO-SIMD128-NEXT: i32.const $push30=, 6 -; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30 -; NO-SIMD128-NEXT: i32.or $push29=, $7, $23 -; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29 -; NO-SIMD128-NEXT: 
i32.const $push33=, 5 -; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-NEXT: i32.or $push32=, $6, $22 -; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32 -; NO-SIMD128-NEXT: i32.const $push36=, 3 -; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36 -; NO-SIMD128-NEXT: i32.or $push35=, $4, $20 -; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35 +; NO-SIMD128-NEXT: i32.or $push0=, $16, $32 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop0 +; NO-SIMD128-NEXT: i32.or $push1=, $15, $31 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop1 +; NO-SIMD128-NEXT: i32.or $push2=, $14, $30 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop2 +; NO-SIMD128-NEXT: i32.or $push3=, $13, $29 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop3 +; NO-SIMD128-NEXT: i32.or $push4=, $12, $28 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop4 +; NO-SIMD128-NEXT: i32.or $push5=, $11, $27 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop5 +; NO-SIMD128-NEXT: i32.or $push6=, $10, $26 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop6 +; NO-SIMD128-NEXT: i32.or $push7=, $9, $25 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop7 +; NO-SIMD128-NEXT: i32.or $push8=, $8, $24 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop8 +; NO-SIMD128-NEXT: i32.or $push9=, $7, $23 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop9 +; NO-SIMD128-NEXT: i32.or $push10=, $6, $22 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop10 +; NO-SIMD128-NEXT: i32.or $push11=, $5, $21 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop11 +; NO-SIMD128-NEXT: i32.or $push12=, $4, $20 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop12 +; NO-SIMD128-NEXT: i32.or $push13=, $3, $19 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop13 +; NO-SIMD128-NEXT: i32.or $push14=, $2, $18 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop14 +; NO-SIMD128-NEXT: i32.or $push15=, $1, $17 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop15 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: or_v16i8: @@ -4900,54 +4042,32 @@ define <16 x i8> @or_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1 ; NO-SIMD128-FAST-NEXT: 
i32.or $push2=, $3, $19 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.or $push5=, $4, $20 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5 -; NO-SIMD128-FAST-NEXT: i32.or $push6=, $5, $21 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.or $push9=, $6, $22 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-FAST-NEXT: i32.or $push12=, $7, $23 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.or $push15=, $8, $24 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15 -; NO-SIMD128-FAST-NEXT: i32.or $push16=, $9, $25 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.or $push19=, $10, $26 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19 -; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20 -; NO-SIMD128-FAST-NEXT: i32.or $push22=, $11, $27 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-FAST-NEXT: i32.or $push25=, $12, $28 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-FAST-NEXT: i32.or $push28=, $13, $29 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28 -; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, 
$pop29 -; NO-SIMD128-FAST-NEXT: i32.or $push31=, $14, $30 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31 -; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32 -; NO-SIMD128-FAST-NEXT: i32.or $push34=, $15, $31 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34 -; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35 -; NO-SIMD128-FAST-NEXT: i32.or $push37=, $16, $32 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37 +; NO-SIMD128-FAST-NEXT: i32.or $push3=, $4, $20 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3 +; NO-SIMD128-FAST-NEXT: i32.or $push4=, $5, $21 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.or $push5=, $6, $22 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.or $push6=, $7, $23 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.or $push7=, $8, $24 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7 +; NO-SIMD128-FAST-NEXT: i32.or $push8=, $9, $25 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.or $push9=, $10, $26 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9 +; NO-SIMD128-FAST-NEXT: i32.or $push10=, $11, $27 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.or $push11=, $12, $28 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.or $push12=, $13, $29 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.or $push13=, $14, $30 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13 +; NO-SIMD128-FAST-NEXT: i32.or $push14=, $15, $31 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.or $push15=, $16, $32 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15 ; NO-SIMD128-FAST-NEXT: return %a = or <16 x i8> %x, %y ret <16 x i8> %a @@ -4969,60 +4089,38 @@ define <16 x i8> @xor_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-LABEL: 
xor_v16i8: ; NO-SIMD128: .functype xor_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.xor $push0=, $9, $25 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop0 -; NO-SIMD128-NEXT: i32.xor $push1=, $5, $21 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop1 -; NO-SIMD128-NEXT: i32.xor $push2=, $3, $19 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop2 -; NO-SIMD128-NEXT: i32.xor $push3=, $2, $18 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop3 -; NO-SIMD128-NEXT: i32.xor $push4=, $1, $17 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push6=, 15 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.xor $push5=, $16, $32 -; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5 -; NO-SIMD128-NEXT: i32.const $push9=, 14 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-NEXT: i32.xor $push8=, $15, $31 -; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8 -; NO-SIMD128-NEXT: i32.const $push12=, 13 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.xor $push11=, $14, $30 -; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11 -; NO-SIMD128-NEXT: i32.const $push15=, 12 -; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15 -; NO-SIMD128-NEXT: i32.xor $push14=, $13, $29 -; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14 -; NO-SIMD128-NEXT: i32.const $push18=, 11 -; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-NEXT: i32.xor $push17=, $12, $28 -; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17 -; NO-SIMD128-NEXT: i32.const $push21=, 10 -; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-NEXT: i32.xor $push20=, $11, $27 -; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20 -; NO-SIMD128-NEXT: i32.const $push24=, 9 -; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-NEXT: i32.xor $push23=, $10, $26 -; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23 -; 
NO-SIMD128-NEXT: i32.const $push27=, 7 -; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-NEXT: i32.xor $push26=, $8, $24 -; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26 -; NO-SIMD128-NEXT: i32.const $push30=, 6 -; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30 -; NO-SIMD128-NEXT: i32.xor $push29=, $7, $23 -; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29 -; NO-SIMD128-NEXT: i32.const $push33=, 5 -; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-NEXT: i32.xor $push32=, $6, $22 -; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32 -; NO-SIMD128-NEXT: i32.const $push36=, 3 -; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36 -; NO-SIMD128-NEXT: i32.xor $push35=, $4, $20 -; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35 +; NO-SIMD128-NEXT: i32.xor $push0=, $16, $32 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop0 +; NO-SIMD128-NEXT: i32.xor $push1=, $15, $31 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop1 +; NO-SIMD128-NEXT: i32.xor $push2=, $14, $30 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop2 +; NO-SIMD128-NEXT: i32.xor $push3=, $13, $29 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop3 +; NO-SIMD128-NEXT: i32.xor $push4=, $12, $28 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop4 +; NO-SIMD128-NEXT: i32.xor $push5=, $11, $27 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop5 +; NO-SIMD128-NEXT: i32.xor $push6=, $10, $26 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop6 +; NO-SIMD128-NEXT: i32.xor $push7=, $9, $25 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop7 +; NO-SIMD128-NEXT: i32.xor $push8=, $8, $24 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop8 +; NO-SIMD128-NEXT: i32.xor $push9=, $7, $23 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop9 +; NO-SIMD128-NEXT: i32.xor $push10=, $6, $22 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop10 +; NO-SIMD128-NEXT: i32.xor $push11=, $5, $21 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop11 +; NO-SIMD128-NEXT: i32.xor $push12=, $4, $20 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop12 +; NO-SIMD128-NEXT: i32.xor $push13=, $3, $19 +; NO-SIMD128-NEXT: i32.store8 
2($0), $pop13 +; NO-SIMD128-NEXT: i32.xor $push14=, $2, $18 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop14 +; NO-SIMD128-NEXT: i32.xor $push15=, $1, $17 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop15 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: xor_v16i8: @@ -5034,54 +4132,32 @@ define <16 x i8> @xor_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $3, $19 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $4, $20 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5 -; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $5, $21 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $6, $22 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $7, $23 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $8, $24 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15 -; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $9, $25 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $10, $26 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19 -; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20 -; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $11, $27 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11 -; NO-SIMD128-FAST-NEXT: i32.add 
$push24=, $0, $pop23 -; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $12, $28 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-FAST-NEXT: i32.xor $push28=, $13, $29 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28 -; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-FAST-NEXT: i32.xor $push31=, $14, $30 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31 -; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32 -; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $15, $31 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34 -; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35 -; NO-SIMD128-FAST-NEXT: i32.xor $push37=, $16, $32 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37 +; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $4, $20 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3 +; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $5, $21 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $6, $22 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $7, $23 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $8, $24 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7 +; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $9, $25 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $10, $26 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9 +; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $11, $27 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $12, $28 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $13, $29 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.xor 
$push13=, $14, $30 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13 +; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $15, $31 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $16, $32 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15 ; NO-SIMD128-FAST-NEXT: return %a = xor <16 x i8> %x, %y ret <16 x i8> %a @@ -5104,75 +4180,53 @@ define <16 x i8> @not_v16i8(<16 x i8> %x) { ; NO-SIMD128: .functype not_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, -1 -; NO-SIMD128-NEXT: i32.xor $push1=, $9, $pop0 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop1 -; NO-SIMD128-NEXT: i32.const $push53=, -1 -; NO-SIMD128-NEXT: i32.xor $push2=, $5, $pop53 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push52=, -1 -; NO-SIMD128-NEXT: i32.xor $push3=, $3, $pop52 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push51=, -1 -; NO-SIMD128-NEXT: i32.xor $push4=, $2, $pop51 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push50=, -1 -; NO-SIMD128-NEXT: i32.xor $push5=, $1, $pop50 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop5 -; NO-SIMD128-NEXT: i32.const $push7=, 15 -; NO-SIMD128-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-NEXT: i32.const $push49=, -1 -; NO-SIMD128-NEXT: i32.xor $push6=, $16, $pop49 -; NO-SIMD128-NEXT: i32.store8 0($pop8), $pop6 -; NO-SIMD128-NEXT: i32.const $push10=, 14 -; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-NEXT: i32.const $push48=, -1 -; NO-SIMD128-NEXT: i32.xor $push9=, $15, $pop48 -; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9 -; NO-SIMD128-NEXT: i32.const $push13=, 13 -; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-NEXT: i32.const $push47=, -1 -; NO-SIMD128-NEXT: i32.xor $push12=, $14, $pop47 -; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12 -; NO-SIMD128-NEXT: i32.const $push16=, 12 -; 
NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.const $push46=, -1 -; NO-SIMD128-NEXT: i32.xor $push15=, $13, $pop46 -; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15 -; NO-SIMD128-NEXT: i32.const $push19=, 11 -; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19 -; NO-SIMD128-NEXT: i32.const $push45=, -1 -; NO-SIMD128-NEXT: i32.xor $push18=, $12, $pop45 -; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18 -; NO-SIMD128-NEXT: i32.const $push22=, 10 -; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22 -; NO-SIMD128-NEXT: i32.const $push44=, -1 -; NO-SIMD128-NEXT: i32.xor $push21=, $11, $pop44 -; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21 -; NO-SIMD128-NEXT: i32.const $push25=, 9 -; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25 -; NO-SIMD128-NEXT: i32.const $push43=, -1 -; NO-SIMD128-NEXT: i32.xor $push24=, $10, $pop43 -; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24 -; NO-SIMD128-NEXT: i32.const $push28=, 7 -; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28 -; NO-SIMD128-NEXT: i32.const $push42=, -1 -; NO-SIMD128-NEXT: i32.xor $push27=, $8, $pop42 -; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27 -; NO-SIMD128-NEXT: i32.const $push31=, 6 -; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31 -; NO-SIMD128-NEXT: i32.const $push41=, -1 -; NO-SIMD128-NEXT: i32.xor $push30=, $7, $pop41 -; NO-SIMD128-NEXT: i32.store8 0($pop32), $pop30 -; NO-SIMD128-NEXT: i32.const $push34=, 5 -; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34 -; NO-SIMD128-NEXT: i32.const $push40=, -1 -; NO-SIMD128-NEXT: i32.xor $push33=, $6, $pop40 -; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33 -; NO-SIMD128-NEXT: i32.const $push37=, 3 -; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37 -; NO-SIMD128-NEXT: i32.const $push39=, -1 -; NO-SIMD128-NEXT: i32.xor $push36=, $4, $pop39 -; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36 +; NO-SIMD128-NEXT: i32.xor $push1=, $16, $pop0 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop1 +; NO-SIMD128-NEXT: i32.const $push31=, -1 +; NO-SIMD128-NEXT: i32.xor $push2=, $15, 
$pop31 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop2 +; NO-SIMD128-NEXT: i32.const $push30=, -1 +; NO-SIMD128-NEXT: i32.xor $push3=, $14, $pop30 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop3 +; NO-SIMD128-NEXT: i32.const $push29=, -1 +; NO-SIMD128-NEXT: i32.xor $push4=, $13, $pop29 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop4 +; NO-SIMD128-NEXT: i32.const $push28=, -1 +; NO-SIMD128-NEXT: i32.xor $push5=, $12, $pop28 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop5 +; NO-SIMD128-NEXT: i32.const $push27=, -1 +; NO-SIMD128-NEXT: i32.xor $push6=, $11, $pop27 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push26=, -1 +; NO-SIMD128-NEXT: i32.xor $push7=, $10, $pop26 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop7 +; NO-SIMD128-NEXT: i32.const $push25=, -1 +; NO-SIMD128-NEXT: i32.xor $push8=, $9, $pop25 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop8 +; NO-SIMD128-NEXT: i32.const $push24=, -1 +; NO-SIMD128-NEXT: i32.xor $push9=, $8, $pop24 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop9 +; NO-SIMD128-NEXT: i32.const $push23=, -1 +; NO-SIMD128-NEXT: i32.xor $push10=, $7, $pop23 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop10 +; NO-SIMD128-NEXT: i32.const $push22=, -1 +; NO-SIMD128-NEXT: i32.xor $push11=, $6, $pop22 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop11 +; NO-SIMD128-NEXT: i32.const $push21=, -1 +; NO-SIMD128-NEXT: i32.xor $push12=, $5, $pop21 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop12 +; NO-SIMD128-NEXT: i32.const $push20=, -1 +; NO-SIMD128-NEXT: i32.xor $push13=, $4, $pop20 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop13 +; NO-SIMD128-NEXT: i32.const $push19=, -1 +; NO-SIMD128-NEXT: i32.xor $push14=, $3, $pop19 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop14 +; NO-SIMD128-NEXT: i32.const $push18=, -1 +; NO-SIMD128-NEXT: i32.xor $push15=, $2, $pop18 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop15 +; NO-SIMD128-NEXT: i32.const $push17=, -1 +; NO-SIMD128-NEXT: i32.xor $push16=, $1, $pop17 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop16 ; NO-SIMD128-NEXT: return ; ; 
NO-SIMD128-FAST-LABEL: not_v16i8: @@ -5181,73 +4235,51 @@ define <16 x i8> @not_v16i8(<16 x i8> %x) { ; NO-SIMD128-FAST-NEXT: i32.const $push0=, -1 ; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop0 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop1 -; NO-SIMD128-FAST-NEXT: i32.const $push53=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop53 +; NO-SIMD128-FAST-NEXT: i32.const $push31=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop31 ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push52=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop52 +; NO-SIMD128-FAST-NEXT: i32.const $push30=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop30 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push4=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push51=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $4, $pop51 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop5), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push50=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $5, $pop50 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop7 -; NO-SIMD128-FAST-NEXT: i32.const $push8=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push49=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $6, $pop49 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop9), $pop10 -; NO-SIMD128-FAST-NEXT: i32.const $push11=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-FAST-NEXT: i32.const $push48=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $7, $pop48 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop12), $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push14=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-FAST-NEXT: i32.const $push47=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $8, $pop47 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push46=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor 
$push17=, $9, $pop46 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push18=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-FAST-NEXT: i32.const $push45=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $10, $pop45 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop19), $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-FAST-NEXT: i32.const $push44=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $11, $pop44 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop23 -; NO-SIMD128-FAST-NEXT: i32.const $push24=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-FAST-NEXT: i32.const $push43=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $12, $pop43 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.const $push42=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $13, $pop42 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop29 -; NO-SIMD128-FAST-NEXT: i32.const $push30=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30 -; NO-SIMD128-FAST-NEXT: i32.const $push41=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $14, $pop41 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop32 -; NO-SIMD128-FAST-NEXT: i32.const $push33=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-FAST-NEXT: i32.const $push40=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $15, $pop40 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop35 -; NO-SIMD128-FAST-NEXT: i32.const $push36=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $16, $pop39 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop38 +; NO-SIMD128-FAST-NEXT: i32.const $push29=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $4, $pop29 +; 
NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.const $push28=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $5, $pop28 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.const $push27=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $6, $pop27 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.const $push26=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $7, $pop26 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop7 +; NO-SIMD128-FAST-NEXT: i32.const $push25=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $8, $pop25 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.const $push24=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $9, $pop24 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop9 +; NO-SIMD128-FAST-NEXT: i32.const $push23=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $10, $pop23 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.const $push22=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $11, $pop22 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $12, $pop21 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $13, $pop20 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop13 +; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $14, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $15, $pop18 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop15 +; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $16, $pop17 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop16 ; NO-SIMD128-FAST-NEXT: return %a = xor <16 x i8> %x, @andnot_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128: .functype 
andnot_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, -1 -; NO-SIMD128-NEXT: i32.xor $push1=, $25, $pop0 -; NO-SIMD128-NEXT: i32.and $push2=, $9, $pop1 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push69=, -1 -; NO-SIMD128-NEXT: i32.xor $push3=, $21, $pop69 -; NO-SIMD128-NEXT: i32.and $push4=, $5, $pop3 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push68=, -1 -; NO-SIMD128-NEXT: i32.xor $push5=, $19, $pop68 -; NO-SIMD128-NEXT: i32.and $push6=, $3, $pop5 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop6 -; NO-SIMD128-NEXT: i32.const $push67=, -1 -; NO-SIMD128-NEXT: i32.xor $push7=, $18, $pop67 -; NO-SIMD128-NEXT: i32.and $push8=, $2, $pop7 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop8 -; NO-SIMD128-NEXT: i32.const $push66=, -1 -; NO-SIMD128-NEXT: i32.xor $push9=, $17, $pop66 -; NO-SIMD128-NEXT: i32.and $push10=, $1, $pop9 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop10 -; NO-SIMD128-NEXT: i32.const $push13=, 15 -; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-NEXT: i32.const $push65=, -1 -; NO-SIMD128-NEXT: i32.xor $push11=, $32, $pop65 -; NO-SIMD128-NEXT: i32.and $push12=, $16, $pop11 -; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12 -; NO-SIMD128-NEXT: i32.const $push17=, 14 -; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-NEXT: i32.const $push64=, -1 -; NO-SIMD128-NEXT: i32.xor $push15=, $31, $pop64 -; NO-SIMD128-NEXT: i32.and $push16=, $15, $pop15 -; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16 -; NO-SIMD128-NEXT: i32.const $push21=, 13 -; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-NEXT: i32.const $push63=, -1 -; NO-SIMD128-NEXT: i32.xor $push19=, $30, $pop63 -; NO-SIMD128-NEXT: i32.and $push20=, $14, $pop19 -; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20 -; NO-SIMD128-NEXT: 
i32.const $push25=, 12 -; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25 -; NO-SIMD128-NEXT: i32.const $push62=, -1 -; NO-SIMD128-NEXT: i32.xor $push23=, $29, $pop62 -; NO-SIMD128-NEXT: i32.and $push24=, $13, $pop23 -; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24 -; NO-SIMD128-NEXT: i32.const $push29=, 11 -; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-NEXT: i32.const $push61=, -1 -; NO-SIMD128-NEXT: i32.xor $push27=, $28, $pop61 -; NO-SIMD128-NEXT: i32.and $push28=, $12, $pop27 -; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28 -; NO-SIMD128-NEXT: i32.const $push33=, 10 -; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-NEXT: i32.const $push60=, -1 -; NO-SIMD128-NEXT: i32.xor $push31=, $27, $pop60 -; NO-SIMD128-NEXT: i32.and $push32=, $11, $pop31 -; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32 -; NO-SIMD128-NEXT: i32.const $push37=, 9 -; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37 -; NO-SIMD128-NEXT: i32.const $push59=, -1 -; NO-SIMD128-NEXT: i32.xor $push35=, $26, $pop59 -; NO-SIMD128-NEXT: i32.and $push36=, $10, $pop35 -; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36 -; NO-SIMD128-NEXT: i32.const $push41=, 7 -; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41 -; NO-SIMD128-NEXT: i32.const $push58=, -1 -; NO-SIMD128-NEXT: i32.xor $push39=, $24, $pop58 -; NO-SIMD128-NEXT: i32.and $push40=, $8, $pop39 -; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40 -; NO-SIMD128-NEXT: i32.const $push45=, 6 -; NO-SIMD128-NEXT: i32.add $push46=, $0, $pop45 -; NO-SIMD128-NEXT: i32.const $push57=, -1 -; NO-SIMD128-NEXT: i32.xor $push43=, $23, $pop57 -; NO-SIMD128-NEXT: i32.and $push44=, $7, $pop43 -; NO-SIMD128-NEXT: i32.store8 0($pop46), $pop44 -; NO-SIMD128-NEXT: i32.const $push49=, 5 -; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49 -; NO-SIMD128-NEXT: i32.const $push56=, -1 -; NO-SIMD128-NEXT: i32.xor $push47=, $22, $pop56 -; NO-SIMD128-NEXT: i32.and $push48=, $6, $pop47 -; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48 -; NO-SIMD128-NEXT: i32.const 
$push53=, 3 -; NO-SIMD128-NEXT: i32.add $push54=, $0, $pop53 -; NO-SIMD128-NEXT: i32.const $push55=, -1 -; NO-SIMD128-NEXT: i32.xor $push51=, $20, $pop55 -; NO-SIMD128-NEXT: i32.and $push52=, $4, $pop51 -; NO-SIMD128-NEXT: i32.store8 0($pop54), $pop52 +; NO-SIMD128-NEXT: i32.xor $push1=, $32, $pop0 +; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop1 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop2 +; NO-SIMD128-NEXT: i32.const $push47=, -1 +; NO-SIMD128-NEXT: i32.xor $push3=, $31, $pop47 +; NO-SIMD128-NEXT: i32.and $push4=, $15, $pop3 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop4 +; NO-SIMD128-NEXT: i32.const $push46=, -1 +; NO-SIMD128-NEXT: i32.xor $push5=, $30, $pop46 +; NO-SIMD128-NEXT: i32.and $push6=, $14, $pop5 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push45=, -1 +; NO-SIMD128-NEXT: i32.xor $push7=, $29, $pop45 +; NO-SIMD128-NEXT: i32.and $push8=, $13, $pop7 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop8 +; NO-SIMD128-NEXT: i32.const $push44=, -1 +; NO-SIMD128-NEXT: i32.xor $push9=, $28, $pop44 +; NO-SIMD128-NEXT: i32.and $push10=, $12, $pop9 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop10 +; NO-SIMD128-NEXT: i32.const $push43=, -1 +; NO-SIMD128-NEXT: i32.xor $push11=, $27, $pop43 +; NO-SIMD128-NEXT: i32.and $push12=, $11, $pop11 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop12 +; NO-SIMD128-NEXT: i32.const $push42=, -1 +; NO-SIMD128-NEXT: i32.xor $push13=, $26, $pop42 +; NO-SIMD128-NEXT: i32.and $push14=, $10, $pop13 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop14 +; NO-SIMD128-NEXT: i32.const $push41=, -1 +; NO-SIMD128-NEXT: i32.xor $push15=, $25, $pop41 +; NO-SIMD128-NEXT: i32.and $push16=, $9, $pop15 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop16 +; NO-SIMD128-NEXT: i32.const $push40=, -1 +; NO-SIMD128-NEXT: i32.xor $push17=, $24, $pop40 +; NO-SIMD128-NEXT: i32.and $push18=, $8, $pop17 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop18 +; NO-SIMD128-NEXT: i32.const $push39=, -1 +; NO-SIMD128-NEXT: i32.xor $push19=, $23, $pop39 +; NO-SIMD128-NEXT: 
i32.and $push20=, $7, $pop19 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop20 +; NO-SIMD128-NEXT: i32.const $push38=, -1 +; NO-SIMD128-NEXT: i32.xor $push21=, $22, $pop38 +; NO-SIMD128-NEXT: i32.and $push22=, $6, $pop21 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop22 +; NO-SIMD128-NEXT: i32.const $push37=, -1 +; NO-SIMD128-NEXT: i32.xor $push23=, $21, $pop37 +; NO-SIMD128-NEXT: i32.and $push24=, $5, $pop23 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop24 +; NO-SIMD128-NEXT: i32.const $push36=, -1 +; NO-SIMD128-NEXT: i32.xor $push25=, $20, $pop36 +; NO-SIMD128-NEXT: i32.and $push26=, $4, $pop25 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop26 +; NO-SIMD128-NEXT: i32.const $push35=, -1 +; NO-SIMD128-NEXT: i32.xor $push27=, $19, $pop35 +; NO-SIMD128-NEXT: i32.and $push28=, $3, $pop27 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop28 +; NO-SIMD128-NEXT: i32.const $push34=, -1 +; NO-SIMD128-NEXT: i32.xor $push29=, $18, $pop34 +; NO-SIMD128-NEXT: i32.and $push30=, $2, $pop29 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop30 +; NO-SIMD128-NEXT: i32.const $push33=, -1 +; NO-SIMD128-NEXT: i32.xor $push31=, $17, $pop33 +; NO-SIMD128-NEXT: i32.and $push32=, $1, $pop31 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop32 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: andnot_v16i8: @@ -5368,88 +4378,66 @@ define <16 x i8> @andnot_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $17, $pop0 ; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop1 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push69=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $18, $pop69 +; NO-SIMD128-FAST-NEXT: i32.const $push47=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $18, $pop47 ; NO-SIMD128-FAST-NEXT: i32.and $push4=, $2, $pop3 ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push68=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $19, $pop68 +; NO-SIMD128-FAST-NEXT: i32.const $push46=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $19, 
$pop46 ; NO-SIMD128-FAST-NEXT: i32.and $push6=, $3, $pop5 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.const $push67=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $20, $pop67 -; NO-SIMD128-FAST-NEXT: i32.and $push10=, $4, $pop9 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop10 -; NO-SIMD128-FAST-NEXT: i32.const $push66=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $21, $pop66 -; NO-SIMD128-FAST-NEXT: i32.and $push12=, $5, $pop11 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push65=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $22, $pop65 -; NO-SIMD128-FAST-NEXT: i32.and $push16=, $6, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push64=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $23, $pop64 -; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop19 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-FAST-NEXT: i32.const $push63=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $24, $pop63 -; NO-SIMD128-FAST-NEXT: i32.and $push24=, $8, $pop23 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop24 -; NO-SIMD128-FAST-NEXT: i32.const $push62=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $25, $pop62 -; NO-SIMD128-FAST-NEXT: i32.and $push26=, $9, $pop25 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.const $push61=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $26, $pop61 -; NO-SIMD128-FAST-NEXT: i32.and 
$push30=, $10, $pop29 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop30 -; NO-SIMD128-FAST-NEXT: i32.const $push31=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31 -; NO-SIMD128-FAST-NEXT: i32.const $push60=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $27, $pop60 -; NO-SIMD128-FAST-NEXT: i32.and $push34=, $11, $pop33 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop32), $pop34 -; NO-SIMD128-FAST-NEXT: i32.const $push35=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35 -; NO-SIMD128-FAST-NEXT: i32.const $push59=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push37=, $28, $pop59 -; NO-SIMD128-FAST-NEXT: i32.and $push38=, $12, $pop37 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop38 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39 -; NO-SIMD128-FAST-NEXT: i32.const $push58=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $29, $pop58 -; NO-SIMD128-FAST-NEXT: i32.and $push42=, $13, $pop41 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop42 -; NO-SIMD128-FAST-NEXT: i32.const $push43=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push44=, $0, $pop43 -; NO-SIMD128-FAST-NEXT: i32.const $push57=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $30, $pop57 -; NO-SIMD128-FAST-NEXT: i32.and $push46=, $14, $pop45 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop44), $pop46 -; NO-SIMD128-FAST-NEXT: i32.const $push47=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push48=, $0, $pop47 -; NO-SIMD128-FAST-NEXT: i32.const $push56=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push49=, $31, $pop56 -; NO-SIMD128-FAST-NEXT: i32.and $push50=, $15, $pop49 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop48), $pop50 -; NO-SIMD128-FAST-NEXT: i32.const $push51=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push52=, $0, $pop51 -; NO-SIMD128-FAST-NEXT: i32.const $push55=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push53=, $32, $pop55 -; NO-SIMD128-FAST-NEXT: i32.and $push54=, $16, $pop53 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop52), $pop54 +; NO-SIMD128-FAST-NEXT: i32.const 
$push45=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $20, $pop45 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $4, $pop7 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.const $push44=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $21, $pop44 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $5, $pop9 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.const $push43=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $22, $pop43 +; NO-SIMD128-FAST-NEXT: i32.and $push12=, $6, $pop11 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push42=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $23, $pop42 +; NO-SIMD128-FAST-NEXT: i32.and $push14=, $7, $pop13 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.const $push41=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $24, $pop41 +; NO-SIMD128-FAST-NEXT: i32.and $push16=, $8, $pop15 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.const $push40=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $25, $pop40 +; NO-SIMD128-FAST-NEXT: i32.and $push18=, $9, $pop17 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop18 +; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $26, $pop39 +; NO-SIMD128-FAST-NEXT: i32.and $push20=, $10, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.const $push38=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $27, $pop38 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $11, $pop21 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop22 +; NO-SIMD128-FAST-NEXT: i32.const $push37=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $28, $pop37 +; NO-SIMD128-FAST-NEXT: i32.and $push24=, $12, $pop23 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.const $push36=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $29, $pop36 +; NO-SIMD128-FAST-NEXT: i32.and $push26=, $13, $pop25 +; 
NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop26 +; NO-SIMD128-FAST-NEXT: i32.const $push35=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push27=, $30, $pop35 +; NO-SIMD128-FAST-NEXT: i32.and $push28=, $14, $pop27 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop28 +; NO-SIMD128-FAST-NEXT: i32.const $push34=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $31, $pop34 +; NO-SIMD128-FAST-NEXT: i32.and $push30=, $15, $pop29 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop30 +; NO-SIMD128-FAST-NEXT: i32.const $push33=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push31=, $32, $pop33 +; NO-SIMD128-FAST-NEXT: i32.and $push32=, $16, $pop31 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop32 ; NO-SIMD128-FAST-NEXT: return %inv_y = xor <16 x i8> %y, @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) { ; NO-SIMD128-LABEL: bitselect_v16i8: ; NO-SIMD128: .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push5=, 15 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 ; NO-SIMD128-NEXT: i32.and $push0=, $16, $32 ; NO-SIMD128-NEXT: i32.const $push1=, -1 ; NO-SIMD128-NEXT: i32.xor $push2=, $16, $pop1 ; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $48 ; NO-SIMD128-NEXT: i32.or $push4=, $pop0, $pop3 -; NO-SIMD128-NEXT: i32.store8 0($pop6), $pop4 -; NO-SIMD128-NEXT: i32.const $push11=, 14 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.and $push7=, $15, $31 -; NO-SIMD128-NEXT: i32.const $push101=, -1 -; NO-SIMD128-NEXT: i32.xor $push8=, $15, $pop101 -; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $47 -; NO-SIMD128-NEXT: i32.or $push10=, $pop7, $pop9 -; NO-SIMD128-NEXT: i32.store8 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push17=, 13 -; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17 -; 
NO-SIMD128-NEXT: i32.and $push13=, $14, $30 -; NO-SIMD128-NEXT: i32.const $push100=, -1 -; NO-SIMD128-NEXT: i32.xor $push14=, $14, $pop100 -; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $46 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop4 +; NO-SIMD128-NEXT: i32.and $push5=, $15, $31 +; NO-SIMD128-NEXT: i32.const $push79=, -1 +; NO-SIMD128-NEXT: i32.xor $push6=, $15, $pop79 +; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $47 +; NO-SIMD128-NEXT: i32.or $push8=, $pop5, $pop7 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop8 +; NO-SIMD128-NEXT: i32.and $push9=, $14, $30 +; NO-SIMD128-NEXT: i32.const $push78=, -1 +; NO-SIMD128-NEXT: i32.xor $push10=, $14, $pop78 +; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $46 +; NO-SIMD128-NEXT: i32.or $push12=, $pop9, $pop11 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop12 +; NO-SIMD128-NEXT: i32.and $push13=, $13, $29 +; NO-SIMD128-NEXT: i32.const $push77=, -1 +; NO-SIMD128-NEXT: i32.xor $push14=, $13, $pop77 +; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $45 ; NO-SIMD128-NEXT: i32.or $push16=, $pop13, $pop15 -; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16 -; NO-SIMD128-NEXT: i32.const $push23=, 12 -; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-NEXT: i32.and $push19=, $13, $29 -; NO-SIMD128-NEXT: i32.const $push99=, -1 -; NO-SIMD128-NEXT: i32.xor $push20=, $13, $pop99 -; NO-SIMD128-NEXT: i32.and $push21=, $pop20, $45 -; NO-SIMD128-NEXT: i32.or $push22=, $pop19, $pop21 -; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22 -; NO-SIMD128-NEXT: i32.const $push29=, 11 -; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-NEXT: i32.and $push25=, $12, $28 -; NO-SIMD128-NEXT: i32.const $push98=, -1 -; NO-SIMD128-NEXT: i32.xor $push26=, $12, $pop98 -; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $44 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop16 +; NO-SIMD128-NEXT: i32.and $push17=, $12, $28 +; NO-SIMD128-NEXT: i32.const $push76=, -1 +; NO-SIMD128-NEXT: i32.xor $push18=, $12, $pop76 +; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $44 +; 
NO-SIMD128-NEXT: i32.or $push20=, $pop17, $pop19 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop20 +; NO-SIMD128-NEXT: i32.and $push21=, $11, $27 +; NO-SIMD128-NEXT: i32.const $push75=, -1 +; NO-SIMD128-NEXT: i32.xor $push22=, $11, $pop75 +; NO-SIMD128-NEXT: i32.and $push23=, $pop22, $43 +; NO-SIMD128-NEXT: i32.or $push24=, $pop21, $pop23 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop24 +; NO-SIMD128-NEXT: i32.and $push25=, $10, $26 +; NO-SIMD128-NEXT: i32.const $push74=, -1 +; NO-SIMD128-NEXT: i32.xor $push26=, $10, $pop74 +; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $42 ; NO-SIMD128-NEXT: i32.or $push28=, $pop25, $pop27 -; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28 -; NO-SIMD128-NEXT: i32.const $push35=, 10 -; NO-SIMD128-NEXT: i32.add $push36=, $0, $pop35 -; NO-SIMD128-NEXT: i32.and $push31=, $11, $27 -; NO-SIMD128-NEXT: i32.const $push97=, -1 -; NO-SIMD128-NEXT: i32.xor $push32=, $11, $pop97 -; NO-SIMD128-NEXT: i32.and $push33=, $pop32, $43 -; NO-SIMD128-NEXT: i32.or $push34=, $pop31, $pop33 -; NO-SIMD128-NEXT: i32.store8 0($pop36), $pop34 -; NO-SIMD128-NEXT: i32.const $push41=, 9 -; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41 -; NO-SIMD128-NEXT: i32.and $push37=, $10, $26 -; NO-SIMD128-NEXT: i32.const $push96=, -1 -; NO-SIMD128-NEXT: i32.xor $push38=, $10, $pop96 -; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $42 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop28 +; NO-SIMD128-NEXT: i32.and $push29=, $9, $25 +; NO-SIMD128-NEXT: i32.const $push73=, -1 +; NO-SIMD128-NEXT: i32.xor $push30=, $9, $pop73 +; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $41 +; NO-SIMD128-NEXT: i32.or $push32=, $pop29, $pop31 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop32 +; NO-SIMD128-NEXT: i32.and $push33=, $8, $24 +; NO-SIMD128-NEXT: i32.const $push72=, -1 +; NO-SIMD128-NEXT: i32.xor $push34=, $8, $pop72 +; NO-SIMD128-NEXT: i32.and $push35=, $pop34, $40 +; NO-SIMD128-NEXT: i32.or $push36=, $pop33, $pop35 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop36 +; NO-SIMD128-NEXT: i32.and $push37=, $7, $23 
+; NO-SIMD128-NEXT: i32.const $push71=, -1 +; NO-SIMD128-NEXT: i32.xor $push38=, $7, $pop71 +; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $39 ; NO-SIMD128-NEXT: i32.or $push40=, $pop37, $pop39 -; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40 -; NO-SIMD128-NEXT: i32.and $push43=, $9, $25 -; NO-SIMD128-NEXT: i32.const $push95=, -1 -; NO-SIMD128-NEXT: i32.xor $push44=, $9, $pop95 -; NO-SIMD128-NEXT: i32.and $push45=, $pop44, $41 -; NO-SIMD128-NEXT: i32.or $push46=, $pop43, $pop45 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop46 -; NO-SIMD128-NEXT: i32.const $push51=, 7 -; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51 -; NO-SIMD128-NEXT: i32.and $push47=, $8, $24 -; NO-SIMD128-NEXT: i32.const $push94=, -1 -; NO-SIMD128-NEXT: i32.xor $push48=, $8, $pop94 -; NO-SIMD128-NEXT: i32.and $push49=, $pop48, $40 -; NO-SIMD128-NEXT: i32.or $push50=, $pop47, $pop49 -; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50 -; NO-SIMD128-NEXT: i32.const $push57=, 6 -; NO-SIMD128-NEXT: i32.add $push58=, $0, $pop57 -; NO-SIMD128-NEXT: i32.and $push53=, $7, $23 -; NO-SIMD128-NEXT: i32.const $push93=, -1 -; NO-SIMD128-NEXT: i32.xor $push54=, $7, $pop93 -; NO-SIMD128-NEXT: i32.and $push55=, $pop54, $39 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop40 +; NO-SIMD128-NEXT: i32.and $push41=, $6, $22 +; NO-SIMD128-NEXT: i32.const $push70=, -1 +; NO-SIMD128-NEXT: i32.xor $push42=, $6, $pop70 +; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $38 +; NO-SIMD128-NEXT: i32.or $push44=, $pop41, $pop43 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop44 +; NO-SIMD128-NEXT: i32.and $push45=, $5, $21 +; NO-SIMD128-NEXT: i32.const $push69=, -1 +; NO-SIMD128-NEXT: i32.xor $push46=, $5, $pop69 +; NO-SIMD128-NEXT: i32.and $push47=, $pop46, $37 +; NO-SIMD128-NEXT: i32.or $push48=, $pop45, $pop47 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop48 +; NO-SIMD128-NEXT: i32.and $push49=, $4, $20 +; NO-SIMD128-NEXT: i32.const $push68=, -1 +; NO-SIMD128-NEXT: i32.xor $push50=, $4, $pop68 +; NO-SIMD128-NEXT: i32.and $push51=, $pop50, $36 +; 
NO-SIMD128-NEXT: i32.or $push52=, $pop49, $pop51 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop52 +; NO-SIMD128-NEXT: i32.and $push53=, $3, $19 +; NO-SIMD128-NEXT: i32.const $push67=, -1 +; NO-SIMD128-NEXT: i32.xor $push54=, $3, $pop67 +; NO-SIMD128-NEXT: i32.and $push55=, $pop54, $35 ; NO-SIMD128-NEXT: i32.or $push56=, $pop53, $pop55 -; NO-SIMD128-NEXT: i32.store8 0($pop58), $pop56 -; NO-SIMD128-NEXT: i32.const $push63=, 5 -; NO-SIMD128-NEXT: i32.add $push64=, $0, $pop63 -; NO-SIMD128-NEXT: i32.and $push59=, $6, $22 -; NO-SIMD128-NEXT: i32.const $push92=, -1 -; NO-SIMD128-NEXT: i32.xor $push60=, $6, $pop92 -; NO-SIMD128-NEXT: i32.and $push61=, $pop60, $38 -; NO-SIMD128-NEXT: i32.or $push62=, $pop59, $pop61 -; NO-SIMD128-NEXT: i32.store8 0($pop64), $pop62 -; NO-SIMD128-NEXT: i32.and $push65=, $5, $21 -; NO-SIMD128-NEXT: i32.const $push91=, -1 -; NO-SIMD128-NEXT: i32.xor $push66=, $5, $pop91 -; NO-SIMD128-NEXT: i32.and $push67=, $pop66, $37 -; NO-SIMD128-NEXT: i32.or $push68=, $pop65, $pop67 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop68 -; NO-SIMD128-NEXT: i32.const $push73=, 3 -; NO-SIMD128-NEXT: i32.add $push74=, $0, $pop73 -; NO-SIMD128-NEXT: i32.and $push69=, $4, $20 -; NO-SIMD128-NEXT: i32.const $push90=, -1 -; NO-SIMD128-NEXT: i32.xor $push70=, $4, $pop90 -; NO-SIMD128-NEXT: i32.and $push71=, $pop70, $36 -; NO-SIMD128-NEXT: i32.or $push72=, $pop69, $pop71 -; NO-SIMD128-NEXT: i32.store8 0($pop74), $pop72 -; NO-SIMD128-NEXT: i32.and $push75=, $3, $19 -; NO-SIMD128-NEXT: i32.const $push89=, -1 -; NO-SIMD128-NEXT: i32.xor $push76=, $3, $pop89 -; NO-SIMD128-NEXT: i32.and $push77=, $pop76, $35 -; NO-SIMD128-NEXT: i32.or $push78=, $pop75, $pop77 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop78 -; NO-SIMD128-NEXT: i32.and $push79=, $2, $18 -; NO-SIMD128-NEXT: i32.const $push88=, -1 -; NO-SIMD128-NEXT: i32.xor $push80=, $2, $pop88 -; NO-SIMD128-NEXT: i32.and $push81=, $pop80, $34 -; NO-SIMD128-NEXT: i32.or $push82=, $pop79, $pop81 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop82 -; 
NO-SIMD128-NEXT: i32.and $push83=, $1, $17 -; NO-SIMD128-NEXT: i32.const $push87=, -1 -; NO-SIMD128-NEXT: i32.xor $push84=, $1, $pop87 -; NO-SIMD128-NEXT: i32.and $push85=, $pop84, $33 -; NO-SIMD128-NEXT: i32.or $push86=, $pop83, $pop85 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop86 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop56 +; NO-SIMD128-NEXT: i32.and $push57=, $2, $18 +; NO-SIMD128-NEXT: i32.const $push66=, -1 +; NO-SIMD128-NEXT: i32.xor $push58=, $2, $pop66 +; NO-SIMD128-NEXT: i32.and $push59=, $pop58, $34 +; NO-SIMD128-NEXT: i32.or $push60=, $pop57, $pop59 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop60 +; NO-SIMD128-NEXT: i32.and $push61=, $1, $17 +; NO-SIMD128-NEXT: i32.const $push65=, -1 +; NO-SIMD128-NEXT: i32.xor $push62=, $1, $pop65 +; NO-SIMD128-NEXT: i32.and $push63=, $pop62, $33 +; NO-SIMD128-NEXT: i32.or $push64=, $pop61, $pop63 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop64 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: bitselect_v16i8: @@ -5607,117 +4573,95 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) { ; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop0, $pop3 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4 ; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $18 -; NO-SIMD128-FAST-NEXT: i32.const $push101=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop101 +; NO-SIMD128-FAST-NEXT: i32.const $push79=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop79 ; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $34 ; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop5, $pop7 ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8 ; NO-SIMD128-FAST-NEXT: i32.and $push9=, $3, $19 -; NO-SIMD128-FAST-NEXT: i32.const $push100=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop100 +; NO-SIMD128-FAST-NEXT: i32.const $push78=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop78 ; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $35 ; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop9, $pop11 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12 -; 
NO-SIMD128-FAST-NEXT: i32.const $push17=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 ; NO-SIMD128-FAST-NEXT: i32.and $push13=, $4, $20 -; NO-SIMD128-FAST-NEXT: i32.const $push99=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop99 +; NO-SIMD128-FAST-NEXT: i32.const $push77=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop77 ; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $36 ; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop13, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16 -; NO-SIMD128-FAST-NEXT: i32.and $push19=, $5, $21 -; NO-SIMD128-FAST-NEXT: i32.const $push98=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $5, $pop98 -; NO-SIMD128-FAST-NEXT: i32.and $push21=, $pop20, $37 -; NO-SIMD128-FAST-NEXT: i32.or $push22=, $pop19, $pop21 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.and $push23=, $6, $22 -; NO-SIMD128-FAST-NEXT: i32.const $push97=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $6, $pop97 -; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $38 -; NO-SIMD128-FAST-NEXT: i32.or $push26=, $pop23, $pop25 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push33=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-FAST-NEXT: i32.and $push29=, $7, $23 -; NO-SIMD128-FAST-NEXT: i32.const $push96=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $7, $pop96 -; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $39 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $5, $21 +; NO-SIMD128-FAST-NEXT: i32.const $push76=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $5, $pop76 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $37 +; NO-SIMD128-FAST-NEXT: i32.or $push20=, $pop17, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.and $push21=, $6, $22 +; 
NO-SIMD128-FAST-NEXT: i32.const $push75=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $6, $pop75 +; NO-SIMD128-FAST-NEXT: i32.and $push23=, $pop22, $38 +; NO-SIMD128-FAST-NEXT: i32.or $push24=, $pop21, $pop23 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.and $push25=, $7, $23 +; NO-SIMD128-FAST-NEXT: i32.const $push74=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $7, $pop74 +; NO-SIMD128-FAST-NEXT: i32.and $push27=, $pop26, $39 +; NO-SIMD128-FAST-NEXT: i32.or $push28=, $pop25, $pop27 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28 +; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $24 +; NO-SIMD128-FAST-NEXT: i32.const $push73=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $8, $pop73 +; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $40 ; NO-SIMD128-FAST-NEXT: i32.or $push32=, $pop29, $pop31 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39 -; NO-SIMD128-FAST-NEXT: i32.and $push35=, $8, $24 -; NO-SIMD128-FAST-NEXT: i32.const $push95=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $8, $pop95 -; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $40 -; NO-SIMD128-FAST-NEXT: i32.or $push38=, $pop35, $pop37 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38 -; NO-SIMD128-FAST-NEXT: i32.and $push41=, $9, $25 -; NO-SIMD128-FAST-NEXT: i32.const $push94=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $9, $pop94 -; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $41 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32 +; NO-SIMD128-FAST-NEXT: i32.and $push33=, $9, $25 +; NO-SIMD128-FAST-NEXT: i32.const $push72=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $9, $pop72 +; NO-SIMD128-FAST-NEXT: i32.and $push35=, $pop34, $41 +; NO-SIMD128-FAST-NEXT: i32.or $push36=, $pop33, $pop35 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36 +; NO-SIMD128-FAST-NEXT: i32.and $push37=, $10, $26 +; NO-SIMD128-FAST-NEXT: i32.const $push71=, -1 +; 
NO-SIMD128-FAST-NEXT: i32.xor $push38=, $10, $pop71 +; NO-SIMD128-FAST-NEXT: i32.and $push39=, $pop38, $42 +; NO-SIMD128-FAST-NEXT: i32.or $push40=, $pop37, $pop39 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40 +; NO-SIMD128-FAST-NEXT: i32.and $push41=, $11, $27 +; NO-SIMD128-FAST-NEXT: i32.const $push70=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $11, $pop70 +; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $43 ; NO-SIMD128-FAST-NEXT: i32.or $push44=, $pop41, $pop43 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop44 -; NO-SIMD128-FAST-NEXT: i32.const $push49=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49 -; NO-SIMD128-FAST-NEXT: i32.and $push45=, $10, $26 -; NO-SIMD128-FAST-NEXT: i32.const $push93=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $10, $pop93 -; NO-SIMD128-FAST-NEXT: i32.and $push47=, $pop46, $42 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44 +; NO-SIMD128-FAST-NEXT: i32.and $push45=, $12, $28 +; NO-SIMD128-FAST-NEXT: i32.const $push69=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $12, $pop69 +; NO-SIMD128-FAST-NEXT: i32.and $push47=, $pop46, $44 ; NO-SIMD128-FAST-NEXT: i32.or $push48=, $pop45, $pop47 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48 -; NO-SIMD128-FAST-NEXT: i32.const $push55=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55 -; NO-SIMD128-FAST-NEXT: i32.and $push51=, $11, $27 -; NO-SIMD128-FAST-NEXT: i32.const $push92=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push52=, $11, $pop92 -; NO-SIMD128-FAST-NEXT: i32.and $push53=, $pop52, $43 -; NO-SIMD128-FAST-NEXT: i32.or $push54=, $pop51, $pop53 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop54 -; NO-SIMD128-FAST-NEXT: i32.const $push61=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61 -; NO-SIMD128-FAST-NEXT: i32.and $push57=, $12, $28 -; NO-SIMD128-FAST-NEXT: i32.const $push91=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $12, $pop91 -; NO-SIMD128-FAST-NEXT: i32.and $push59=, $pop58, $44 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48 
+; NO-SIMD128-FAST-NEXT: i32.and $push49=, $13, $29 +; NO-SIMD128-FAST-NEXT: i32.const $push68=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push50=, $13, $pop68 +; NO-SIMD128-FAST-NEXT: i32.and $push51=, $pop50, $45 +; NO-SIMD128-FAST-NEXT: i32.or $push52=, $pop49, $pop51 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52 +; NO-SIMD128-FAST-NEXT: i32.and $push53=, $14, $30 +; NO-SIMD128-FAST-NEXT: i32.const $push67=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $14, $pop67 +; NO-SIMD128-FAST-NEXT: i32.and $push55=, $pop54, $46 +; NO-SIMD128-FAST-NEXT: i32.or $push56=, $pop53, $pop55 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56 +; NO-SIMD128-FAST-NEXT: i32.and $push57=, $15, $31 +; NO-SIMD128-FAST-NEXT: i32.const $push66=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $15, $pop66 +; NO-SIMD128-FAST-NEXT: i32.and $push59=, $pop58, $47 ; NO-SIMD128-FAST-NEXT: i32.or $push60=, $pop57, $pop59 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop60 -; NO-SIMD128-FAST-NEXT: i32.const $push67=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push68=, $0, $pop67 -; NO-SIMD128-FAST-NEXT: i32.and $push63=, $13, $29 -; NO-SIMD128-FAST-NEXT: i32.const $push90=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push64=, $13, $pop90 -; NO-SIMD128-FAST-NEXT: i32.and $push65=, $pop64, $45 -; NO-SIMD128-FAST-NEXT: i32.or $push66=, $pop63, $pop65 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop68), $pop66 -; NO-SIMD128-FAST-NEXT: i32.const $push73=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push74=, $0, $pop73 -; NO-SIMD128-FAST-NEXT: i32.and $push69=, $14, $30 -; NO-SIMD128-FAST-NEXT: i32.const $push89=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push70=, $14, $pop89 -; NO-SIMD128-FAST-NEXT: i32.and $push71=, $pop70, $46 -; NO-SIMD128-FAST-NEXT: i32.or $push72=, $pop69, $pop71 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop74), $pop72 -; NO-SIMD128-FAST-NEXT: i32.const $push79=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push80=, $0, $pop79 -; NO-SIMD128-FAST-NEXT: i32.and $push75=, $15, $31 -; NO-SIMD128-FAST-NEXT: i32.const 
$push88=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push76=, $15, $pop88 -; NO-SIMD128-FAST-NEXT: i32.and $push77=, $pop76, $47 -; NO-SIMD128-FAST-NEXT: i32.or $push78=, $pop75, $pop77 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop80), $pop78 -; NO-SIMD128-FAST-NEXT: i32.const $push85=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push86=, $0, $pop85 -; NO-SIMD128-FAST-NEXT: i32.and $push81=, $16, $32 -; NO-SIMD128-FAST-NEXT: i32.const $push87=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push82=, $16, $pop87 -; NO-SIMD128-FAST-NEXT: i32.and $push83=, $pop82, $48 -; NO-SIMD128-FAST-NEXT: i32.or $push84=, $pop81, $pop83 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop86), $pop84 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60 +; NO-SIMD128-FAST-NEXT: i32.and $push61=, $16, $32 +; NO-SIMD128-FAST-NEXT: i32.const $push65=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push62=, $16, $pop65 +; NO-SIMD128-FAST-NEXT: i32.and $push63=, $pop62, $48 +; NO-SIMD128-FAST-NEXT: i32.or $push64=, $pop61, $pop63 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64 ; NO-SIMD128-FAST-NEXT: return %masked_v1 = and <16 x i8> %c, %v1 %inv_mask = xor <16 x i8> %c, @@ -5746,92 +4690,70 @@ define <16 x i8> @bitselect_xor_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2 ; NO-SIMD128-LABEL: bitselect_xor_v16i8: ; NO-SIMD128: .functype bitselect_xor_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push3=, 15 -; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3 ; NO-SIMD128-NEXT: i32.xor $push0=, $32, $48 ; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $16 ; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $48 -; NO-SIMD128-NEXT: i32.store8 0($pop4), $pop2 -; NO-SIMD128-NEXT: i32.const $push8=, 14 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.xor $push5=, $31, 
$47 -; NO-SIMD128-NEXT: i32.and $push6=, $pop5, $15 -; NO-SIMD128-NEXT: i32.xor $push7=, $pop6, $47 -; NO-SIMD128-NEXT: i32.store8 0($pop9), $pop7 -; NO-SIMD128-NEXT: i32.const $push13=, 13 -; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-NEXT: i32.xor $push10=, $30, $46 -; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $14 -; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $46 -; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12 -; NO-SIMD128-NEXT: i32.const $push18=, 12 -; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-NEXT: i32.xor $push15=, $29, $45 -; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $13 -; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $45 -; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17 -; NO-SIMD128-NEXT: i32.const $push23=, 11 -; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-NEXT: i32.xor $push20=, $28, $44 -; NO-SIMD128-NEXT: i32.and $push21=, $pop20, $12 -; NO-SIMD128-NEXT: i32.xor $push22=, $pop21, $44 -; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22 -; NO-SIMD128-NEXT: i32.const $push28=, 10 -; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28 -; NO-SIMD128-NEXT: i32.xor $push25=, $27, $43 -; NO-SIMD128-NEXT: i32.and $push26=, $pop25, $11 -; NO-SIMD128-NEXT: i32.xor $push27=, $pop26, $43 -; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27 -; NO-SIMD128-NEXT: i32.const $push33=, 9 -; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-NEXT: i32.xor $push30=, $26, $42 -; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $10 -; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $42 -; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32 -; NO-SIMD128-NEXT: i32.xor $push35=, $25, $41 -; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $9 -; NO-SIMD128-NEXT: i32.xor $push37=, $pop36, $41 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop37 -; NO-SIMD128-NEXT: i32.const $push41=, 7 -; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41 -; NO-SIMD128-NEXT: i32.xor $push38=, $24, $40 -; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $8 -; NO-SIMD128-NEXT: i32.xor 
$push40=, $pop39, $40 -; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40 -; NO-SIMD128-NEXT: i32.const $push46=, 6 -; NO-SIMD128-NEXT: i32.add $push47=, $0, $pop46 -; NO-SIMD128-NEXT: i32.xor $push43=, $23, $39 -; NO-SIMD128-NEXT: i32.and $push44=, $pop43, $7 -; NO-SIMD128-NEXT: i32.xor $push45=, $pop44, $39 -; NO-SIMD128-NEXT: i32.store8 0($pop47), $pop45 -; NO-SIMD128-NEXT: i32.const $push51=, 5 -; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51 -; NO-SIMD128-NEXT: i32.xor $push48=, $22, $38 -; NO-SIMD128-NEXT: i32.and $push49=, $pop48, $6 -; NO-SIMD128-NEXT: i32.xor $push50=, $pop49, $38 -; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50 -; NO-SIMD128-NEXT: i32.xor $push53=, $21, $37 -; NO-SIMD128-NEXT: i32.and $push54=, $pop53, $5 -; NO-SIMD128-NEXT: i32.xor $push55=, $pop54, $37 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop55 -; NO-SIMD128-NEXT: i32.const $push59=, 3 -; NO-SIMD128-NEXT: i32.add $push60=, $0, $pop59 -; NO-SIMD128-NEXT: i32.xor $push56=, $20, $36 -; NO-SIMD128-NEXT: i32.and $push57=, $pop56, $4 -; NO-SIMD128-NEXT: i32.xor $push58=, $pop57, $36 -; NO-SIMD128-NEXT: i32.store8 0($pop60), $pop58 -; NO-SIMD128-NEXT: i32.xor $push61=, $19, $35 -; NO-SIMD128-NEXT: i32.and $push62=, $pop61, $3 -; NO-SIMD128-NEXT: i32.xor $push63=, $pop62, $35 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop63 -; NO-SIMD128-NEXT: i32.xor $push64=, $18, $34 -; NO-SIMD128-NEXT: i32.and $push65=, $pop64, $2 -; NO-SIMD128-NEXT: i32.xor $push66=, $pop65, $34 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop66 -; NO-SIMD128-NEXT: i32.xor $push67=, $17, $33 -; NO-SIMD128-NEXT: i32.and $push68=, $pop67, $1 -; NO-SIMD128-NEXT: i32.xor $push69=, $pop68, $33 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop69 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop2 +; NO-SIMD128-NEXT: i32.xor $push3=, $31, $47 +; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $15 +; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $47 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop5 +; NO-SIMD128-NEXT: i32.xor $push6=, $30, $46 +; NO-SIMD128-NEXT: i32.and 
$push7=, $pop6, $14 +; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $46 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop8 +; NO-SIMD128-NEXT: i32.xor $push9=, $29, $45 +; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $13 +; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $45 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop11 +; NO-SIMD128-NEXT: i32.xor $push12=, $28, $44 +; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $12 +; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $44 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop14 +; NO-SIMD128-NEXT: i32.xor $push15=, $27, $43 +; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $11 +; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $43 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop17 +; NO-SIMD128-NEXT: i32.xor $push18=, $26, $42 +; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $10 +; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $42 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop20 +; NO-SIMD128-NEXT: i32.xor $push21=, $25, $41 +; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $9 +; NO-SIMD128-NEXT: i32.xor $push23=, $pop22, $41 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop23 +; NO-SIMD128-NEXT: i32.xor $push24=, $24, $40 +; NO-SIMD128-NEXT: i32.and $push25=, $pop24, $8 +; NO-SIMD128-NEXT: i32.xor $push26=, $pop25, $40 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop26 +; NO-SIMD128-NEXT: i32.xor $push27=, $23, $39 +; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $7 +; NO-SIMD128-NEXT: i32.xor $push29=, $pop28, $39 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop29 +; NO-SIMD128-NEXT: i32.xor $push30=, $22, $38 +; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $6 +; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $38 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop32 +; NO-SIMD128-NEXT: i32.xor $push33=, $21, $37 +; NO-SIMD128-NEXT: i32.and $push34=, $pop33, $5 +; NO-SIMD128-NEXT: i32.xor $push35=, $pop34, $37 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop35 +; NO-SIMD128-NEXT: i32.xor $push36=, $20, $36 +; NO-SIMD128-NEXT: i32.and $push37=, $pop36, $4 +; NO-SIMD128-NEXT: i32.xor $push38=, $pop37, $36 +; 
NO-SIMD128-NEXT: i32.store8 3($0), $pop38 +; NO-SIMD128-NEXT: i32.xor $push39=, $19, $35 +; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $3 +; NO-SIMD128-NEXT: i32.xor $push41=, $pop40, $35 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop41 +; NO-SIMD128-NEXT: i32.xor $push42=, $18, $34 +; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $2 +; NO-SIMD128-NEXT: i32.xor $push44=, $pop43, $34 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop44 +; NO-SIMD128-NEXT: i32.xor $push45=, $17, $33 +; NO-SIMD128-NEXT: i32.and $push46=, $pop45, $1 +; NO-SIMD128-NEXT: i32.xor $push47=, $pop46, $33 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop47 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: bitselect_xor_v16i8: @@ -5849,80 +4771,58 @@ define <16 x i8> @bitselect_xor_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2 ; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3 ; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $35 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $20, $36 -; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $4 -; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $pop12, $36 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop10), $pop13 -; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $21, $37 -; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $5 -; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $37 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 5 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $22, $38 -; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $6 -; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $pop20, $38 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop21 -; NO-SIMD128-FAST-NEXT: i32.const $push22=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push23=, $0, $pop22 -; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $23, $39 -; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $7 -; 
NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $39 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop23), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 7 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $24, $40 -; NO-SIMD128-FAST-NEXT: i32.and $push30=, $pop29, $8 -; NO-SIMD128-FAST-NEXT: i32.xor $push31=, $pop30, $40 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop31 -; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $25, $41 -; NO-SIMD128-FAST-NEXT: i32.and $push33=, $pop32, $9 -; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $pop33, $41 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop34 -; NO-SIMD128-FAST-NEXT: i32.const $push35=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35 -; NO-SIMD128-FAST-NEXT: i32.xor $push37=, $26, $42 -; NO-SIMD128-FAST-NEXT: i32.and $push38=, $pop37, $10 -; NO-SIMD128-FAST-NEXT: i32.xor $push39=, $pop38, $42 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop39 -; NO-SIMD128-FAST-NEXT: i32.const $push40=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push41=, $0, $pop40 -; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $27, $43 -; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $11 -; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $43 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop41), $pop44 -; NO-SIMD128-FAST-NEXT: i32.const $push45=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push46=, $0, $pop45 -; NO-SIMD128-FAST-NEXT: i32.xor $push47=, $28, $44 -; NO-SIMD128-FAST-NEXT: i32.and $push48=, $pop47, $12 -; NO-SIMD128-FAST-NEXT: i32.xor $push49=, $pop48, $44 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop46), $pop49 -; NO-SIMD128-FAST-NEXT: i32.const $push50=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push51=, $0, $pop50 -; NO-SIMD128-FAST-NEXT: i32.xor $push52=, $29, $45 -; NO-SIMD128-FAST-NEXT: i32.and $push53=, $pop52, $13 -; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $pop53, $45 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop51), $pop54 -; NO-SIMD128-FAST-NEXT: i32.const $push55=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, 
$pop55 -; NO-SIMD128-FAST-NEXT: i32.xor $push57=, $30, $46 -; NO-SIMD128-FAST-NEXT: i32.and $push58=, $pop57, $14 -; NO-SIMD128-FAST-NEXT: i32.xor $push59=, $pop58, $46 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop59 -; NO-SIMD128-FAST-NEXT: i32.const $push60=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push61=, $0, $pop60 -; NO-SIMD128-FAST-NEXT: i32.xor $push62=, $31, $47 -; NO-SIMD128-FAST-NEXT: i32.and $push63=, $pop62, $15 -; NO-SIMD128-FAST-NEXT: i32.xor $push64=, $pop63, $47 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop61), $pop64 -; NO-SIMD128-FAST-NEXT: i32.const $push65=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push66=, $0, $pop65 -; NO-SIMD128-FAST-NEXT: i32.xor $push67=, $32, $48 -; NO-SIMD128-FAST-NEXT: i32.and $push68=, $pop67, $16 -; NO-SIMD128-FAST-NEXT: i32.xor $push69=, $pop68, $48 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop66), $pop69 +; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $20, $36 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4 +; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $36 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $21, $37 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $pop12, $5 +; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $pop13, $37 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $22, $38 +; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $6 +; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $pop16, $38 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop17 +; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $23, $39 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $7 +; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $39 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $24, $40 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $8 +; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $pop22, $40 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop23 +; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $25, $41 +; NO-SIMD128-FAST-NEXT: i32.and 
$push25=, $pop24, $9 +; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $41 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26 +; NO-SIMD128-FAST-NEXT: i32.xor $push27=, $26, $42 +; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $10 +; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $pop28, $42 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop29 +; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $27, $43 +; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $11 +; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $43 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop32 +; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $28, $44 +; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $12 +; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $pop34, $44 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop35 +; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $29, $45 +; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $13 +; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $pop37, $45 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop38 +; NO-SIMD128-FAST-NEXT: i32.xor $push39=, $30, $46 +; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $14 +; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $pop40, $46 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop41 +; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $31, $47 +; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $15 +; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $47 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop44 +; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $32, $48 +; NO-SIMD128-FAST-NEXT: i32.and $push46=, $pop45, $16 +; NO-SIMD128-FAST-NEXT: i32.xor $push47=, $pop46, $48 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop47 ; NO-SIMD128-FAST-NEXT: return %xor1 = xor <16 x i8> %v1, %v2 %and = and <16 x i8> %xor1, %c @@ -5949,124 +4849,102 @@ define <16 x i8> @bitselect_xor_reversed_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 ; NO-SIMD128-LABEL: bitselect_xor_reversed_v16i8: ; NO-SIMD128: .functype bitselect_xor_reversed_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push5=, 15 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 ; NO-SIMD128-NEXT: i32.xor $push2=, $32, $48 ; NO-SIMD128-NEXT: i32.const $push0=, -1 ; NO-SIMD128-NEXT: i32.xor $push1=, $16, $pop0 ; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $pop1 ; NO-SIMD128-NEXT: i32.xor $push4=, $pop3, $48 -; NO-SIMD128-NEXT: i32.store8 0($pop6), $pop4 -; NO-SIMD128-NEXT: i32.const $push11=, 14 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.xor $push8=, $31, $47 -; NO-SIMD128-NEXT: i32.const $push101=, -1 -; NO-SIMD128-NEXT: i32.xor $push7=, $15, $pop101 -; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $pop7 -; NO-SIMD128-NEXT: i32.xor $push10=, $pop9, $47 -; NO-SIMD128-NEXT: i32.store8 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push17=, 13 -; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-NEXT: i32.xor $push14=, $30, $46 -; NO-SIMD128-NEXT: i32.const $push100=, -1 -; NO-SIMD128-NEXT: i32.xor $push13=, $14, $pop100 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop4 +; NO-SIMD128-NEXT: i32.xor $push6=, $31, $47 +; NO-SIMD128-NEXT: i32.const $push79=, -1 +; NO-SIMD128-NEXT: i32.xor $push5=, $15, $pop79 +; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $pop5 +; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $47 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop8 +; NO-SIMD128-NEXT: i32.xor $push10=, $30, $46 +; NO-SIMD128-NEXT: i32.const $push78=, -1 +; NO-SIMD128-NEXT: i32.xor $push9=, $14, $pop78 +; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $pop9 +; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $46 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop12 +; NO-SIMD128-NEXT: i32.xor $push14=, $29, $45 +; NO-SIMD128-NEXT: i32.const $push77=, -1 +; NO-SIMD128-NEXT: i32.xor $push13=, $13, $pop77 ; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $pop13 -; 
NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $46 -; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16 -; NO-SIMD128-NEXT: i32.const $push23=, 12 -; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-NEXT: i32.xor $push20=, $29, $45 -; NO-SIMD128-NEXT: i32.const $push99=, -1 -; NO-SIMD128-NEXT: i32.xor $push19=, $13, $pop99 -; NO-SIMD128-NEXT: i32.and $push21=, $pop20, $pop19 -; NO-SIMD128-NEXT: i32.xor $push22=, $pop21, $45 -; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22 -; NO-SIMD128-NEXT: i32.const $push29=, 11 -; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29 -; NO-SIMD128-NEXT: i32.xor $push26=, $28, $44 -; NO-SIMD128-NEXT: i32.const $push98=, -1 -; NO-SIMD128-NEXT: i32.xor $push25=, $12, $pop98 +; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $45 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop16 +; NO-SIMD128-NEXT: i32.xor $push18=, $28, $44 +; NO-SIMD128-NEXT: i32.const $push76=, -1 +; NO-SIMD128-NEXT: i32.xor $push17=, $12, $pop76 +; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $pop17 +; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $44 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop20 +; NO-SIMD128-NEXT: i32.xor $push22=, $27, $43 +; NO-SIMD128-NEXT: i32.const $push75=, -1 +; NO-SIMD128-NEXT: i32.xor $push21=, $11, $pop75 +; NO-SIMD128-NEXT: i32.and $push23=, $pop22, $pop21 +; NO-SIMD128-NEXT: i32.xor $push24=, $pop23, $43 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop24 +; NO-SIMD128-NEXT: i32.xor $push26=, $26, $42 +; NO-SIMD128-NEXT: i32.const $push74=, -1 +; NO-SIMD128-NEXT: i32.xor $push25=, $10, $pop74 ; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $pop25 -; NO-SIMD128-NEXT: i32.xor $push28=, $pop27, $44 -; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28 -; NO-SIMD128-NEXT: i32.const $push35=, 10 -; NO-SIMD128-NEXT: i32.add $push36=, $0, $pop35 -; NO-SIMD128-NEXT: i32.xor $push32=, $27, $43 -; NO-SIMD128-NEXT: i32.const $push97=, -1 -; NO-SIMD128-NEXT: i32.xor $push31=, $11, $pop97 -; NO-SIMD128-NEXT: i32.and $push33=, $pop32, $pop31 -; NO-SIMD128-NEXT: i32.xor $push34=, 
$pop33, $43 -; NO-SIMD128-NEXT: i32.store8 0($pop36), $pop34 -; NO-SIMD128-NEXT: i32.const $push41=, 9 -; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41 -; NO-SIMD128-NEXT: i32.xor $push38=, $26, $42 -; NO-SIMD128-NEXT: i32.const $push96=, -1 -; NO-SIMD128-NEXT: i32.xor $push37=, $10, $pop96 +; NO-SIMD128-NEXT: i32.xor $push28=, $pop27, $42 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop28 +; NO-SIMD128-NEXT: i32.xor $push30=, $25, $41 +; NO-SIMD128-NEXT: i32.const $push73=, -1 +; NO-SIMD128-NEXT: i32.xor $push29=, $9, $pop73 +; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $pop29 +; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $41 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop32 +; NO-SIMD128-NEXT: i32.xor $push34=, $24, $40 +; NO-SIMD128-NEXT: i32.const $push72=, -1 +; NO-SIMD128-NEXT: i32.xor $push33=, $8, $pop72 +; NO-SIMD128-NEXT: i32.and $push35=, $pop34, $pop33 +; NO-SIMD128-NEXT: i32.xor $push36=, $pop35, $40 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop36 +; NO-SIMD128-NEXT: i32.xor $push38=, $23, $39 +; NO-SIMD128-NEXT: i32.const $push71=, -1 +; NO-SIMD128-NEXT: i32.xor $push37=, $7, $pop71 ; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $pop37 -; NO-SIMD128-NEXT: i32.xor $push40=, $pop39, $42 -; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40 -; NO-SIMD128-NEXT: i32.xor $push44=, $25, $41 -; NO-SIMD128-NEXT: i32.const $push95=, -1 -; NO-SIMD128-NEXT: i32.xor $push43=, $9, $pop95 -; NO-SIMD128-NEXT: i32.and $push45=, $pop44, $pop43 -; NO-SIMD128-NEXT: i32.xor $push46=, $pop45, $41 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop46 -; NO-SIMD128-NEXT: i32.const $push51=, 7 -; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51 -; NO-SIMD128-NEXT: i32.xor $push48=, $24, $40 -; NO-SIMD128-NEXT: i32.const $push94=, -1 -; NO-SIMD128-NEXT: i32.xor $push47=, $8, $pop94 -; NO-SIMD128-NEXT: i32.and $push49=, $pop48, $pop47 -; NO-SIMD128-NEXT: i32.xor $push50=, $pop49, $40 -; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50 -; NO-SIMD128-NEXT: i32.const $push57=, 6 -; NO-SIMD128-NEXT: i32.add $push58=, 
$0, $pop57 -; NO-SIMD128-NEXT: i32.xor $push54=, $23, $39 -; NO-SIMD128-NEXT: i32.const $push93=, -1 -; NO-SIMD128-NEXT: i32.xor $push53=, $7, $pop93 +; NO-SIMD128-NEXT: i32.xor $push40=, $pop39, $39 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop40 +; NO-SIMD128-NEXT: i32.xor $push42=, $22, $38 +; NO-SIMD128-NEXT: i32.const $push70=, -1 +; NO-SIMD128-NEXT: i32.xor $push41=, $6, $pop70 +; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $pop41 +; NO-SIMD128-NEXT: i32.xor $push44=, $pop43, $38 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop44 +; NO-SIMD128-NEXT: i32.xor $push46=, $21, $37 +; NO-SIMD128-NEXT: i32.const $push69=, -1 +; NO-SIMD128-NEXT: i32.xor $push45=, $5, $pop69 +; NO-SIMD128-NEXT: i32.and $push47=, $pop46, $pop45 +; NO-SIMD128-NEXT: i32.xor $push48=, $pop47, $37 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop48 +; NO-SIMD128-NEXT: i32.xor $push50=, $20, $36 +; NO-SIMD128-NEXT: i32.const $push68=, -1 +; NO-SIMD128-NEXT: i32.xor $push49=, $4, $pop68 +; NO-SIMD128-NEXT: i32.and $push51=, $pop50, $pop49 +; NO-SIMD128-NEXT: i32.xor $push52=, $pop51, $36 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop52 +; NO-SIMD128-NEXT: i32.xor $push54=, $19, $35 +; NO-SIMD128-NEXT: i32.const $push67=, -1 +; NO-SIMD128-NEXT: i32.xor $push53=, $3, $pop67 ; NO-SIMD128-NEXT: i32.and $push55=, $pop54, $pop53 -; NO-SIMD128-NEXT: i32.xor $push56=, $pop55, $39 -; NO-SIMD128-NEXT: i32.store8 0($pop58), $pop56 -; NO-SIMD128-NEXT: i32.const $push63=, 5 -; NO-SIMD128-NEXT: i32.add $push64=, $0, $pop63 -; NO-SIMD128-NEXT: i32.xor $push60=, $22, $38 -; NO-SIMD128-NEXT: i32.const $push92=, -1 -; NO-SIMD128-NEXT: i32.xor $push59=, $6, $pop92 -; NO-SIMD128-NEXT: i32.and $push61=, $pop60, $pop59 -; NO-SIMD128-NEXT: i32.xor $push62=, $pop61, $38 -; NO-SIMD128-NEXT: i32.store8 0($pop64), $pop62 -; NO-SIMD128-NEXT: i32.xor $push66=, $21, $37 -; NO-SIMD128-NEXT: i32.const $push91=, -1 -; NO-SIMD128-NEXT: i32.xor $push65=, $5, $pop91 -; NO-SIMD128-NEXT: i32.and $push67=, $pop66, $pop65 -; NO-SIMD128-NEXT: i32.xor 
$push68=, $pop67, $37 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop68 -; NO-SIMD128-NEXT: i32.const $push73=, 3 -; NO-SIMD128-NEXT: i32.add $push74=, $0, $pop73 -; NO-SIMD128-NEXT: i32.xor $push70=, $20, $36 -; NO-SIMD128-NEXT: i32.const $push90=, -1 -; NO-SIMD128-NEXT: i32.xor $push69=, $4, $pop90 -; NO-SIMD128-NEXT: i32.and $push71=, $pop70, $pop69 -; NO-SIMD128-NEXT: i32.xor $push72=, $pop71, $36 -; NO-SIMD128-NEXT: i32.store8 0($pop74), $pop72 -; NO-SIMD128-NEXT: i32.xor $push76=, $19, $35 -; NO-SIMD128-NEXT: i32.const $push89=, -1 -; NO-SIMD128-NEXT: i32.xor $push75=, $3, $pop89 -; NO-SIMD128-NEXT: i32.and $push77=, $pop76, $pop75 -; NO-SIMD128-NEXT: i32.xor $push78=, $pop77, $35 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop78 -; NO-SIMD128-NEXT: i32.xor $push80=, $18, $34 -; NO-SIMD128-NEXT: i32.const $push88=, -1 -; NO-SIMD128-NEXT: i32.xor $push79=, $2, $pop88 -; NO-SIMD128-NEXT: i32.and $push81=, $pop80, $pop79 -; NO-SIMD128-NEXT: i32.xor $push82=, $pop81, $34 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop82 -; NO-SIMD128-NEXT: i32.xor $push84=, $17, $33 -; NO-SIMD128-NEXT: i32.const $push87=, -1 -; NO-SIMD128-NEXT: i32.xor $push83=, $1, $pop87 -; NO-SIMD128-NEXT: i32.and $push85=, $pop84, $pop83 -; NO-SIMD128-NEXT: i32.xor $push86=, $pop85, $33 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop86 +; NO-SIMD128-NEXT: i32.xor $push56=, $pop55, $35 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop56 +; NO-SIMD128-NEXT: i32.xor $push58=, $18, $34 +; NO-SIMD128-NEXT: i32.const $push66=, -1 +; NO-SIMD128-NEXT: i32.xor $push57=, $2, $pop66 +; NO-SIMD128-NEXT: i32.and $push59=, $pop58, $pop57 +; NO-SIMD128-NEXT: i32.xor $push60=, $pop59, $34 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop60 +; NO-SIMD128-NEXT: i32.xor $push62=, $17, $33 +; NO-SIMD128-NEXT: i32.const $push65=, -1 +; NO-SIMD128-NEXT: i32.xor $push61=, $1, $pop65 +; NO-SIMD128-NEXT: i32.and $push63=, $pop62, $pop61 +; NO-SIMD128-NEXT: i32.xor $push64=, $pop63, $33 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop64 ; NO-SIMD128-NEXT: 
return ; ; NO-SIMD128-FAST-LABEL: bitselect_xor_reversed_v16i8: @@ -6079,117 +4957,95 @@ define <16 x i8> @bitselect_xor_reversed_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 ; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $pop3, $33 ; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4 ; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $18, $34 -; NO-SIMD128-FAST-NEXT: i32.const $push101=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop101 +; NO-SIMD128-FAST-NEXT: i32.const $push79=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop79 ; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $pop5 ; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $34 ; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8 ; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $19, $35 -; NO-SIMD128-FAST-NEXT: i32.const $push100=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop100 +; NO-SIMD128-FAST-NEXT: i32.const $push78=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop78 ; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $pop9 ; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $pop11, $35 ; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 3 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 ; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $20, $36 -; NO-SIMD128-FAST-NEXT: i32.const $push99=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop99 +; NO-SIMD128-FAST-NEXT: i32.const $push77=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop77 ; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $pop13 ; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $36 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16 -; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $21, $37 -; NO-SIMD128-FAST-NEXT: i32.const $push98=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $5, $pop98 -; NO-SIMD128-FAST-NEXT: i32.and $push21=, $pop20, $pop19 -; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $pop21, $37 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5 -; 
NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $22, $38 -; NO-SIMD128-FAST-NEXT: i32.const $push97=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $6, $pop97 -; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $pop23 -; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $38 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push33=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $23, $39 -; NO-SIMD128-FAST-NEXT: i32.const $push96=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $7, $pop96 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $21, $37 +; NO-SIMD128-FAST-NEXT: i32.const $push76=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $5, $pop76 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $pop17 +; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $37 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $22, $38 +; NO-SIMD128-FAST-NEXT: i32.const $push75=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $6, $pop75 +; NO-SIMD128-FAST-NEXT: i32.and $push23=, $pop22, $pop21 +; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $pop23, $38 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $23, $39 +; NO-SIMD128-FAST-NEXT: i32.const $push74=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $7, $pop74 +; NO-SIMD128-FAST-NEXT: i32.and $push27=, $pop26, $pop25 +; NO-SIMD128-FAST-NEXT: i32.xor $push28=, $pop27, $39 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28 +; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $24, $40 +; NO-SIMD128-FAST-NEXT: i32.const $push73=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $8, $pop73 ; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $pop29 -; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $39 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 7 -; 
NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39 -; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $24, $40 -; NO-SIMD128-FAST-NEXT: i32.const $push95=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $8, $pop95 -; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $pop35 -; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $pop37, $40 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38 -; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $25, $41 -; NO-SIMD128-FAST-NEXT: i32.const $push94=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $9, $pop94 +; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $40 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32 +; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $25, $41 +; NO-SIMD128-FAST-NEXT: i32.const $push72=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $9, $pop72 +; NO-SIMD128-FAST-NEXT: i32.and $push35=, $pop34, $pop33 +; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $pop35, $41 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36 +; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $26, $42 +; NO-SIMD128-FAST-NEXT: i32.const $push71=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push37=, $10, $pop71 +; NO-SIMD128-FAST-NEXT: i32.and $push39=, $pop38, $pop37 +; NO-SIMD128-FAST-NEXT: i32.xor $push40=, $pop39, $42 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40 +; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $27, $43 +; NO-SIMD128-FAST-NEXT: i32.const $push70=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $11, $pop70 ; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $pop41 -; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $41 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop44 -; NO-SIMD128-FAST-NEXT: i32.const $push49=, 9 -; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49 -; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $26, $42 -; NO-SIMD128-FAST-NEXT: i32.const $push93=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $10, $pop93 +; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $43 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44 +; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $28, $44 +; 
NO-SIMD128-FAST-NEXT: i32.const $push69=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $12, $pop69 ; NO-SIMD128-FAST-NEXT: i32.and $push47=, $pop46, $pop45 -; NO-SIMD128-FAST-NEXT: i32.xor $push48=, $pop47, $42 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48 -; NO-SIMD128-FAST-NEXT: i32.const $push55=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55 -; NO-SIMD128-FAST-NEXT: i32.xor $push52=, $27, $43 -; NO-SIMD128-FAST-NEXT: i32.const $push92=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push51=, $11, $pop92 -; NO-SIMD128-FAST-NEXT: i32.and $push53=, $pop52, $pop51 -; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $pop53, $43 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop54 -; NO-SIMD128-FAST-NEXT: i32.const $push61=, 11 -; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61 -; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $28, $44 -; NO-SIMD128-FAST-NEXT: i32.const $push91=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push57=, $12, $pop91 +; NO-SIMD128-FAST-NEXT: i32.xor $push48=, $pop47, $44 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48 +; NO-SIMD128-FAST-NEXT: i32.xor $push50=, $29, $45 +; NO-SIMD128-FAST-NEXT: i32.const $push68=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push49=, $13, $pop68 +; NO-SIMD128-FAST-NEXT: i32.and $push51=, $pop50, $pop49 +; NO-SIMD128-FAST-NEXT: i32.xor $push52=, $pop51, $45 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52 +; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $30, $46 +; NO-SIMD128-FAST-NEXT: i32.const $push67=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push53=, $14, $pop67 +; NO-SIMD128-FAST-NEXT: i32.and $push55=, $pop54, $pop53 +; NO-SIMD128-FAST-NEXT: i32.xor $push56=, $pop55, $46 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56 +; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $31, $47 +; NO-SIMD128-FAST-NEXT: i32.const $push66=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push57=, $15, $pop66 ; NO-SIMD128-FAST-NEXT: i32.and $push59=, $pop58, $pop57 -; NO-SIMD128-FAST-NEXT: i32.xor $push60=, $pop59, $44 -; NO-SIMD128-FAST-NEXT: i32.store8 
0($pop62), $pop60 -; NO-SIMD128-FAST-NEXT: i32.const $push67=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push68=, $0, $pop67 -; NO-SIMD128-FAST-NEXT: i32.xor $push64=, $29, $45 -; NO-SIMD128-FAST-NEXT: i32.const $push90=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push63=, $13, $pop90 -; NO-SIMD128-FAST-NEXT: i32.and $push65=, $pop64, $pop63 -; NO-SIMD128-FAST-NEXT: i32.xor $push66=, $pop65, $45 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop68), $pop66 -; NO-SIMD128-FAST-NEXT: i32.const $push73=, 13 -; NO-SIMD128-FAST-NEXT: i32.add $push74=, $0, $pop73 -; NO-SIMD128-FAST-NEXT: i32.xor $push70=, $30, $46 -; NO-SIMD128-FAST-NEXT: i32.const $push89=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push69=, $14, $pop89 -; NO-SIMD128-FAST-NEXT: i32.and $push71=, $pop70, $pop69 -; NO-SIMD128-FAST-NEXT: i32.xor $push72=, $pop71, $46 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop74), $pop72 -; NO-SIMD128-FAST-NEXT: i32.const $push79=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push80=, $0, $pop79 -; NO-SIMD128-FAST-NEXT: i32.xor $push76=, $31, $47 -; NO-SIMD128-FAST-NEXT: i32.const $push88=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push75=, $15, $pop88 -; NO-SIMD128-FAST-NEXT: i32.and $push77=, $pop76, $pop75 -; NO-SIMD128-FAST-NEXT: i32.xor $push78=, $pop77, $47 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop80), $pop78 -; NO-SIMD128-FAST-NEXT: i32.const $push85=, 15 -; NO-SIMD128-FAST-NEXT: i32.add $push86=, $0, $pop85 -; NO-SIMD128-FAST-NEXT: i32.xor $push82=, $32, $48 -; NO-SIMD128-FAST-NEXT: i32.const $push87=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push81=, $16, $pop87 -; NO-SIMD128-FAST-NEXT: i32.and $push83=, $pop82, $pop81 -; NO-SIMD128-FAST-NEXT: i32.xor $push84=, $pop83, $48 -; NO-SIMD128-FAST-NEXT: i32.store8 0($pop86), $pop84 +; NO-SIMD128-FAST-NEXT: i32.xor $push60=, $pop59, $47 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60 +; NO-SIMD128-FAST-NEXT: i32.xor $push62=, $32, $48 +; NO-SIMD128-FAST-NEXT: i32.const $push65=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push61=, $16, $pop65 +; NO-SIMD128-FAST-NEXT: 
i32.and $push63=, $pop62, $pop61 +; NO-SIMD128-FAST-NEXT: i32.xor $push64=, $pop63, $48 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64 ; NO-SIMD128-FAST-NEXT: return %xor1 = xor <16 x i8> %v1, %v2 %notc = xor <16 x i8> %c, @add_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-LABEL: add_v8i16: ; NO-SIMD128: .functype add_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.add $push0=, $5, $13 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop0 -; NO-SIMD128-NEXT: i32.add $push1=, $3, $11 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop1 -; NO-SIMD128-NEXT: i32.add $push2=, $2, $10 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop2 -; NO-SIMD128-NEXT: i32.add $push3=, $1, $9 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push5=, 14 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 -; NO-SIMD128-NEXT: i32.add $push4=, $8, $16 -; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4 -; NO-SIMD128-NEXT: i32.const $push8=, 12 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.add $push7=, $7, $15 -; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7 -; NO-SIMD128-NEXT: i32.const $push11=, 10 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.add $push10=, $6, $14 -; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push14=, 6 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.add $push13=, $4, $12 -; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13 +; NO-SIMD128-NEXT: i32.add $push0=, $8, $16 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop0 +; NO-SIMD128-NEXT: i32.add $push1=, $7, $15 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop1 +; NO-SIMD128-NEXT: i32.add $push2=, $6, $14 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop2 +; NO-SIMD128-NEXT: i32.add $push3=, $5, $13 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop3 +; NO-SIMD128-NEXT: i32.add $push4=, $4, $12 +; NO-SIMD128-NEXT: i32.store16 6($0), 
$pop4 +; NO-SIMD128-NEXT: i32.add $push5=, $3, $11 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop5 +; NO-SIMD128-NEXT: i32.add $push6=, $2, $10 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop6 +; NO-SIMD128-NEXT: i32.add $push7=, $1, $9 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop7 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: add_v8i16: @@ -6253,24 +5101,16 @@ define <8 x i16> @add_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.add $push2=, $3, $11 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.add $push5=, $4, $12 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5 -; NO-SIMD128-FAST-NEXT: i32.add $push6=, $5, $13 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.add $push9=, $6, $14 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-FAST-NEXT: i32.add $push12=, $7, $15 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.add $push15=, $8, $16 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15 +; NO-SIMD128-FAST-NEXT: i32.add $push3=, $4, $12 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3 +; NO-SIMD128-FAST-NEXT: i32.add $push4=, $5, $13 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.add $push5=, $6, $14 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.add $push6=, $7, $15 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.add $push7=, $8, $16 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7 ; 
NO-SIMD128-FAST-NEXT: return %a = add <8 x i16> %x, %y ret <8 x i16> %a @@ -6292,30 +5132,22 @@ define <8 x i16> @sub_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-LABEL: sub_v8i16: ; NO-SIMD128: .functype sub_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.sub $push0=, $5, $13 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop0 -; NO-SIMD128-NEXT: i32.sub $push1=, $3, $11 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop1 -; NO-SIMD128-NEXT: i32.sub $push2=, $2, $10 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop2 -; NO-SIMD128-NEXT: i32.sub $push3=, $1, $9 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push5=, 14 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 -; NO-SIMD128-NEXT: i32.sub $push4=, $8, $16 -; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4 -; NO-SIMD128-NEXT: i32.const $push8=, 12 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.sub $push7=, $7, $15 -; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7 -; NO-SIMD128-NEXT: i32.const $push11=, 10 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.sub $push10=, $6, $14 -; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push14=, 6 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.sub $push13=, $4, $12 -; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13 +; NO-SIMD128-NEXT: i32.sub $push0=, $8, $16 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop0 +; NO-SIMD128-NEXT: i32.sub $push1=, $7, $15 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop1 +; NO-SIMD128-NEXT: i32.sub $push2=, $6, $14 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop2 +; NO-SIMD128-NEXT: i32.sub $push3=, $5, $13 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop3 +; NO-SIMD128-NEXT: i32.sub $push4=, $4, $12 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop4 +; NO-SIMD128-NEXT: i32.sub $push5=, $3, $11 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop5 +; NO-SIMD128-NEXT: 
i32.sub $push6=, $2, $10 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop6 +; NO-SIMD128-NEXT: i32.sub $push7=, $1, $9 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop7 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: sub_v8i16: @@ -6327,24 +5159,16 @@ define <8 x i16> @sub_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $3, $11 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $4, $12 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5 -; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $5, $13 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $6, $14 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $7, $15 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $8, $16 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15 +; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $4, $12 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3 +; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $5, $13 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $6, $14 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $7, $15 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $8, $16 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7 ; NO-SIMD128-FAST-NEXT: return %a = sub <8 x i16> %x, %y ret <8 x i16> %a @@ -6366,30 +5190,22 @@ define <8 x i16> 
@mul_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-LABEL: mul_v8i16: ; NO-SIMD128: .functype mul_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.mul $push0=, $5, $13 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop0 -; NO-SIMD128-NEXT: i32.mul $push1=, $3, $11 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop1 -; NO-SIMD128-NEXT: i32.mul $push2=, $2, $10 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop2 -; NO-SIMD128-NEXT: i32.mul $push3=, $1, $9 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push5=, 14 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 -; NO-SIMD128-NEXT: i32.mul $push4=, $8, $16 -; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4 -; NO-SIMD128-NEXT: i32.const $push8=, 12 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.mul $push7=, $7, $15 -; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7 -; NO-SIMD128-NEXT: i32.const $push11=, 10 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.mul $push10=, $6, $14 -; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push14=, 6 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.mul $push13=, $4, $12 -; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13 +; NO-SIMD128-NEXT: i32.mul $push0=, $8, $16 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop0 +; NO-SIMD128-NEXT: i32.mul $push1=, $7, $15 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop1 +; NO-SIMD128-NEXT: i32.mul $push2=, $6, $14 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop2 +; NO-SIMD128-NEXT: i32.mul $push3=, $5, $13 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop3 +; NO-SIMD128-NEXT: i32.mul $push4=, $4, $12 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop4 +; NO-SIMD128-NEXT: i32.mul $push5=, $3, $11 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop5 +; NO-SIMD128-NEXT: i32.mul $push6=, $2, $10 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop6 +; NO-SIMD128-NEXT: i32.mul $push7=, $1, 
$9 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop7 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: mul_v8i16: @@ -6401,24 +5217,16 @@ define <8 x i16> @mul_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.mul $push2=, $3, $11 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $4, $12 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5 -; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $5, $13 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $6, $14 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $7, $15 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $8, $16 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15 +; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $4, $12 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3 +; NO-SIMD128-FAST-NEXT: i32.mul $push4=, $5, $13 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $6, $14 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $7, $15 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.mul $push7=, $8, $16 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7 ; NO-SIMD128-FAST-NEXT: return %a = mul <8 x i16> %x, %y ret <8 x i16> %a @@ -6440,54 +5248,46 @@ define <8 x i16> @min_s_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-LABEL: min_s_v8i16: ; NO-SIMD128: .functype min_s_v8i16 (i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push4=, 14 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 ; NO-SIMD128-NEXT: i32.extend16_s $push1=, $8 ; NO-SIMD128-NEXT: i32.extend16_s $push0=, $16 ; NO-SIMD128-NEXT: i32.lt_s $push2=, $pop1, $pop0 ; NO-SIMD128-NEXT: i32.select $push3=, $8, $16, $pop2 -; NO-SIMD128-NEXT: i32.store16 0($pop5), $pop3 -; NO-SIMD128-NEXT: i32.const $push10=, 12 -; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-NEXT: i32.extend16_s $push7=, $7 -; NO-SIMD128-NEXT: i32.extend16_s $push6=, $15 -; NO-SIMD128-NEXT: i32.lt_s $push8=, $pop7, $pop6 -; NO-SIMD128-NEXT: i32.select $push9=, $7, $15, $pop8 -; NO-SIMD128-NEXT: i32.store16 0($pop11), $pop9 -; NO-SIMD128-NEXT: i32.const $push16=, 10 -; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.extend16_s $push13=, $6 -; NO-SIMD128-NEXT: i32.extend16_s $push12=, $14 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop3 +; NO-SIMD128-NEXT: i32.extend16_s $push5=, $7 +; NO-SIMD128-NEXT: i32.extend16_s $push4=, $15 +; NO-SIMD128-NEXT: i32.lt_s $push6=, $pop5, $pop4 +; NO-SIMD128-NEXT: i32.select $push7=, $7, $15, $pop6 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop7 +; NO-SIMD128-NEXT: i32.extend16_s $push9=, $6 +; NO-SIMD128-NEXT: i32.extend16_s $push8=, $14 +; NO-SIMD128-NEXT: i32.lt_s $push10=, $pop9, $pop8 +; NO-SIMD128-NEXT: i32.select $push11=, $6, $14, $pop10 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop11 +; NO-SIMD128-NEXT: i32.extend16_s $push13=, $5 +; NO-SIMD128-NEXT: i32.extend16_s $push12=, $13 ; NO-SIMD128-NEXT: i32.lt_s $push14=, $pop13, $pop12 -; NO-SIMD128-NEXT: i32.select $push15=, $6, $14, $pop14 -; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15 -; NO-SIMD128-NEXT: i32.extend16_s $push19=, $5 -; NO-SIMD128-NEXT: i32.extend16_s $push18=, $13 -; NO-SIMD128-NEXT: i32.lt_s $push20=, $pop19, $pop18 -; NO-SIMD128-NEXT: i32.select $push21=, $5, $13, $pop20 -; 
NO-SIMD128-NEXT: i32.store16 8($0), $pop21 -; NO-SIMD128-NEXT: i32.const $push26=, 6 -; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-NEXT: i32.extend16_s $push23=, $4 -; NO-SIMD128-NEXT: i32.extend16_s $push22=, $12 -; NO-SIMD128-NEXT: i32.lt_s $push24=, $pop23, $pop22 -; NO-SIMD128-NEXT: i32.select $push25=, $4, $12, $pop24 -; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25 -; NO-SIMD128-NEXT: i32.extend16_s $push29=, $3 -; NO-SIMD128-NEXT: i32.extend16_s $push28=, $11 +; NO-SIMD128-NEXT: i32.select $push15=, $5, $13, $pop14 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop15 +; NO-SIMD128-NEXT: i32.extend16_s $push17=, $4 +; NO-SIMD128-NEXT: i32.extend16_s $push16=, $12 +; NO-SIMD128-NEXT: i32.lt_s $push18=, $pop17, $pop16 +; NO-SIMD128-NEXT: i32.select $push19=, $4, $12, $pop18 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop19 +; NO-SIMD128-NEXT: i32.extend16_s $push21=, $3 +; NO-SIMD128-NEXT: i32.extend16_s $push20=, $11 +; NO-SIMD128-NEXT: i32.lt_s $push22=, $pop21, $pop20 +; NO-SIMD128-NEXT: i32.select $push23=, $3, $11, $pop22 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop23 +; NO-SIMD128-NEXT: i32.extend16_s $push25=, $2 +; NO-SIMD128-NEXT: i32.extend16_s $push24=, $10 +; NO-SIMD128-NEXT: i32.lt_s $push26=, $pop25, $pop24 +; NO-SIMD128-NEXT: i32.select $push27=, $2, $10, $pop26 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop27 +; NO-SIMD128-NEXT: i32.extend16_s $push29=, $1 +; NO-SIMD128-NEXT: i32.extend16_s $push28=, $9 ; NO-SIMD128-NEXT: i32.lt_s $push30=, $pop29, $pop28 -; NO-SIMD128-NEXT: i32.select $push31=, $3, $11, $pop30 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop31 -; NO-SIMD128-NEXT: i32.extend16_s $push33=, $2 -; NO-SIMD128-NEXT: i32.extend16_s $push32=, $10 -; NO-SIMD128-NEXT: i32.lt_s $push34=, $pop33, $pop32 -; NO-SIMD128-NEXT: i32.select $push35=, $2, $10, $pop34 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop35 -; NO-SIMD128-NEXT: i32.extend16_s $push37=, $1 -; NO-SIMD128-NEXT: i32.extend16_s $push36=, $9 -; NO-SIMD128-NEXT: i32.lt_s $push38=, 
$pop37, $pop36 -; NO-SIMD128-NEXT: i32.select $push39=, $1, $9, $pop38 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop39 +; NO-SIMD128-NEXT: i32.select $push31=, $1, $9, $pop30 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop31 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: min_s_v8i16: @@ -6508,39 +5308,31 @@ define <8 x i16> @min_s_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-FAST-NEXT: i32.lt_s $push10=, $pop9, $pop8 ; NO-SIMD128-FAST-NEXT: i32.select $push11=, $3, $11, $pop10 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop11 -; NO-SIMD128-FAST-NEXT: i32.const $push16=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16 ; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $4 ; NO-SIMD128-FAST-NEXT: i32.extend16_s $push12=, $12 ; NO-SIMD128-FAST-NEXT: i32.lt_s $push14=, $pop13, $pop12 ; NO-SIMD128-FAST-NEXT: i32.select $push15=, $4, $12, $pop14 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop17), $pop15 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push19=, $5 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push18=, $13 -; NO-SIMD128-FAST-NEXT: i32.lt_s $push20=, $pop19, $pop18 -; NO-SIMD128-FAST-NEXT: i32.select $push21=, $5, $13, $pop20 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop21 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $6 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push22=, $14 -; NO-SIMD128-FAST-NEXT: i32.lt_s $push24=, $pop23, $pop22 -; NO-SIMD128-FAST-NEXT: i32.select $push25=, $6, $14, $pop24 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push32=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push29=, $7 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $15 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop15 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push17=, $5 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push16=, $13 +; NO-SIMD128-FAST-NEXT: i32.lt_s $push18=, $pop17, 
$pop16 +; NO-SIMD128-FAST-NEXT: i32.select $push19=, $5, $13, $pop18 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop19 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push21=, $6 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push20=, $14 +; NO-SIMD128-FAST-NEXT: i32.lt_s $push22=, $pop21, $pop20 +; NO-SIMD128-FAST-NEXT: i32.select $push23=, $6, $14, $pop22 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop23 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push25=, $7 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push24=, $15 +; NO-SIMD128-FAST-NEXT: i32.lt_s $push26=, $pop25, $pop24 +; NO-SIMD128-FAST-NEXT: i32.select $push27=, $7, $15, $pop26 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop27 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push29=, $8 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $16 ; NO-SIMD128-FAST-NEXT: i32.lt_s $push30=, $pop29, $pop28 -; NO-SIMD128-FAST-NEXT: i32.select $push31=, $7, $15, $pop30 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop33), $pop31 -; NO-SIMD128-FAST-NEXT: i32.const $push38=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push35=, $8 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push34=, $16 -; NO-SIMD128-FAST-NEXT: i32.lt_s $push36=, $pop35, $pop34 -; NO-SIMD128-FAST-NEXT: i32.select $push37=, $8, $16, $pop36 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop39), $pop37 +; NO-SIMD128-FAST-NEXT: i32.select $push31=, $8, $16, $pop30 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop31 ; NO-SIMD128-FAST-NEXT: return %c = icmp slt <8 x i16> %x, %y %a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y @@ -6563,70 +5355,62 @@ define <8 x i16> @min_u_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-LABEL: min_u_v8i16: ; NO-SIMD128: .functype min_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push5=, 14 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 ; NO-SIMD128-NEXT: i32.const $push0=, 65535 ; NO-SIMD128-NEXT: 
i32.and $push2=, $8, $pop0 -; NO-SIMD128-NEXT: i32.const $push55=, 65535 -; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop55 +; NO-SIMD128-NEXT: i32.const $push47=, 65535 +; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop47 ; NO-SIMD128-NEXT: i32.lt_u $push3=, $pop2, $pop1 ; NO-SIMD128-NEXT: i32.select $push4=, $8, $16, $pop3 -; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4 -; NO-SIMD128-NEXT: i32.const $push11=, 12 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.const $push54=, 65535 -; NO-SIMD128-NEXT: i32.and $push8=, $7, $pop54 -; NO-SIMD128-NEXT: i32.const $push53=, 65535 -; NO-SIMD128-NEXT: i32.and $push7=, $15, $pop53 -; NO-SIMD128-NEXT: i32.lt_u $push9=, $pop8, $pop7 -; NO-SIMD128-NEXT: i32.select $push10=, $7, $15, $pop9 -; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push17=, 10 -; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-NEXT: i32.const $push52=, 65535 -; NO-SIMD128-NEXT: i32.and $push14=, $6, $pop52 -; NO-SIMD128-NEXT: i32.const $push51=, 65535 -; NO-SIMD128-NEXT: i32.and $push13=, $14, $pop51 -; NO-SIMD128-NEXT: i32.lt_u $push15=, $pop14, $pop13 -; NO-SIMD128-NEXT: i32.select $push16=, $6, $14, $pop15 -; NO-SIMD128-NEXT: i32.store16 0($pop18), $pop16 -; NO-SIMD128-NEXT: i32.const $push50=, 65535 -; NO-SIMD128-NEXT: i32.and $push20=, $5, $pop50 -; NO-SIMD128-NEXT: i32.const $push49=, 65535 -; NO-SIMD128-NEXT: i32.and $push19=, $13, $pop49 -; NO-SIMD128-NEXT: i32.lt_u $push21=, $pop20, $pop19 -; NO-SIMD128-NEXT: i32.select $push22=, $5, $13, $pop21 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop22 -; NO-SIMD128-NEXT: i32.const $push27=, 6 -; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-NEXT: i32.const $push48=, 65535 -; NO-SIMD128-NEXT: i32.and $push24=, $4, $pop48 -; NO-SIMD128-NEXT: i32.const $push47=, 65535 -; NO-SIMD128-NEXT: i32.and $push23=, $12, $pop47 -; NO-SIMD128-NEXT: i32.lt_u $push25=, $pop24, $pop23 -; NO-SIMD128-NEXT: i32.select $push26=, $4, $12, $pop25 -; 
NO-SIMD128-NEXT: i32.store16 0($pop28), $pop26 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop4 ; NO-SIMD128-NEXT: i32.const $push46=, 65535 -; NO-SIMD128-NEXT: i32.and $push30=, $3, $pop46 +; NO-SIMD128-NEXT: i32.and $push6=, $7, $pop46 ; NO-SIMD128-NEXT: i32.const $push45=, 65535 -; NO-SIMD128-NEXT: i32.and $push29=, $11, $pop45 -; NO-SIMD128-NEXT: i32.lt_u $push31=, $pop30, $pop29 -; NO-SIMD128-NEXT: i32.select $push32=, $3, $11, $pop31 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop32 +; NO-SIMD128-NEXT: i32.and $push5=, $15, $pop45 +; NO-SIMD128-NEXT: i32.lt_u $push7=, $pop6, $pop5 +; NO-SIMD128-NEXT: i32.select $push8=, $7, $15, $pop7 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop8 ; NO-SIMD128-NEXT: i32.const $push44=, 65535 -; NO-SIMD128-NEXT: i32.and $push34=, $2, $pop44 +; NO-SIMD128-NEXT: i32.and $push10=, $6, $pop44 ; NO-SIMD128-NEXT: i32.const $push43=, 65535 -; NO-SIMD128-NEXT: i32.and $push33=, $10, $pop43 -; NO-SIMD128-NEXT: i32.lt_u $push35=, $pop34, $pop33 -; NO-SIMD128-NEXT: i32.select $push36=, $2, $10, $pop35 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop36 +; NO-SIMD128-NEXT: i32.and $push9=, $14, $pop43 +; NO-SIMD128-NEXT: i32.lt_u $push11=, $pop10, $pop9 +; NO-SIMD128-NEXT: i32.select $push12=, $6, $14, $pop11 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop12 ; NO-SIMD128-NEXT: i32.const $push42=, 65535 -; NO-SIMD128-NEXT: i32.and $push38=, $1, $pop42 +; NO-SIMD128-NEXT: i32.and $push14=, $5, $pop42 ; NO-SIMD128-NEXT: i32.const $push41=, 65535 -; NO-SIMD128-NEXT: i32.and $push37=, $9, $pop41 -; NO-SIMD128-NEXT: i32.lt_u $push39=, $pop38, $pop37 -; NO-SIMD128-NEXT: i32.select $push40=, $1, $9, $pop39 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop40 +; NO-SIMD128-NEXT: i32.and $push13=, $13, $pop41 +; NO-SIMD128-NEXT: i32.lt_u $push15=, $pop14, $pop13 +; NO-SIMD128-NEXT: i32.select $push16=, $5, $13, $pop15 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop16 +; NO-SIMD128-NEXT: i32.const $push40=, 65535 +; NO-SIMD128-NEXT: i32.and $push18=, $4, $pop40 +; 
NO-SIMD128-NEXT: i32.const $push39=, 65535 +; NO-SIMD128-NEXT: i32.and $push17=, $12, $pop39 +; NO-SIMD128-NEXT: i32.lt_u $push19=, $pop18, $pop17 +; NO-SIMD128-NEXT: i32.select $push20=, $4, $12, $pop19 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop20 +; NO-SIMD128-NEXT: i32.const $push38=, 65535 +; NO-SIMD128-NEXT: i32.and $push22=, $3, $pop38 +; NO-SIMD128-NEXT: i32.const $push37=, 65535 +; NO-SIMD128-NEXT: i32.and $push21=, $11, $pop37 +; NO-SIMD128-NEXT: i32.lt_u $push23=, $pop22, $pop21 +; NO-SIMD128-NEXT: i32.select $push24=, $3, $11, $pop23 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop24 +; NO-SIMD128-NEXT: i32.const $push36=, 65535 +; NO-SIMD128-NEXT: i32.and $push26=, $2, $pop36 +; NO-SIMD128-NEXT: i32.const $push35=, 65535 +; NO-SIMD128-NEXT: i32.and $push25=, $10, $pop35 +; NO-SIMD128-NEXT: i32.lt_u $push27=, $pop26, $pop25 +; NO-SIMD128-NEXT: i32.select $push28=, $2, $10, $pop27 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop28 +; NO-SIMD128-NEXT: i32.const $push34=, 65535 +; NO-SIMD128-NEXT: i32.and $push30=, $1, $pop34 +; NO-SIMD128-NEXT: i32.const $push33=, 65535 +; NO-SIMD128-NEXT: i32.and $push29=, $9, $pop33 +; NO-SIMD128-NEXT: i32.lt_u $push31=, $pop30, $pop29 +; NO-SIMD128-NEXT: i32.select $push32=, $1, $9, $pop31 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop32 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: min_u_v8i16: @@ -6634,68 +5418,60 @@ define <8 x i16> @min_u_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535 ; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0 -; NO-SIMD128-FAST-NEXT: i32.const $push55=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop55 +; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop47 ; NO-SIMD128-FAST-NEXT: i32.lt_u $push3=, $pop2, $pop1 ; NO-SIMD128-FAST-NEXT: i32.select $push4=, $1, $9, $pop3 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push54=, 65535 -; 
NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop54 -; NO-SIMD128-FAST-NEXT: i32.const $push53=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop53 +; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop46 +; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop45 ; NO-SIMD128-FAST-NEXT: i32.lt_u $push7=, $pop6, $pop5 ; NO-SIMD128-FAST-NEXT: i32.select $push8=, $2, $10, $pop7 ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push52=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop52 -; NO-SIMD128-FAST-NEXT: i32.const $push51=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $pop51 +; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop44 +; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $pop43 ; NO-SIMD128-FAST-NEXT: i32.lt_u $push11=, $pop10, $pop9 ; NO-SIMD128-FAST-NEXT: i32.select $push12=, $3, $11, $pop11 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push50=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop50 -; NO-SIMD128-FAST-NEXT: i32.const $push49=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $pop49 +; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop42 +; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $pop41 ; NO-SIMD128-FAST-NEXT: i32.lt_u $push15=, $pop14, $pop13 ; NO-SIMD128-FAST-NEXT: i32.select $push16=, $4, $12, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push48=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push20=, $5, $pop48 -; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535 -; NO-SIMD128-FAST-NEXT: 
i32.and $push19=, $13, $pop47 -; NO-SIMD128-FAST-NEXT: i32.lt_u $push21=, $pop20, $pop19 -; NO-SIMD128-FAST-NEXT: i32.select $push22=, $5, $13, $pop21 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push24=, $6, $pop46 -; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push23=, $14, $pop45 -; NO-SIMD128-FAST-NEXT: i32.lt_u $push25=, $pop24, $pop23 -; NO-SIMD128-FAST-NEXT: i32.select $push26=, $6, $14, $pop25 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push33=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push30=, $7, $pop44 -; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push29=, $15, $pop43 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.const $push40=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push18=, $5, $pop40 +; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $13, $pop39 +; NO-SIMD128-FAST-NEXT: i32.lt_u $push19=, $pop18, $pop17 +; NO-SIMD128-FAST-NEXT: i32.select $push20=, $5, $13, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $6, $pop38 +; NO-SIMD128-FAST-NEXT: i32.const $push37=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push21=, $14, $pop37 +; NO-SIMD128-FAST-NEXT: i32.lt_u $push23=, $pop22, $pop21 +; NO-SIMD128-FAST-NEXT: i32.select $push24=, $6, $14, $pop23 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.const $push36=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push26=, $7, $pop36 +; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65535 +; 
NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $pop35 +; NO-SIMD128-FAST-NEXT: i32.lt_u $push27=, $pop26, $pop25 +; NO-SIMD128-FAST-NEXT: i32.select $push28=, $7, $15, $pop27 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28 +; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push30=, $8, $pop34 +; NO-SIMD128-FAST-NEXT: i32.const $push33=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $pop33 ; NO-SIMD128-FAST-NEXT: i32.lt_u $push31=, $pop30, $pop29 -; NO-SIMD128-FAST-NEXT: i32.select $push32=, $7, $15, $pop31 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop34), $pop32 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39 -; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push36=, $8, $pop42 -; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push35=, $16, $pop41 -; NO-SIMD128-FAST-NEXT: i32.lt_u $push37=, $pop36, $pop35 -; NO-SIMD128-FAST-NEXT: i32.select $push38=, $8, $16, $pop37 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop40), $pop38 +; NO-SIMD128-FAST-NEXT: i32.select $push32=, $8, $16, $pop31 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32 ; NO-SIMD128-FAST-NEXT: return %c = icmp ult <8 x i16> %x, %y %a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y @@ -6718,54 +5494,46 @@ define <8 x i16> @max_s_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-LABEL: max_s_v8i16: ; NO-SIMD128: .functype max_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push4=, 14 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 ; NO-SIMD128-NEXT: i32.extend16_s $push1=, $8 ; NO-SIMD128-NEXT: i32.extend16_s $push0=, $16 ; NO-SIMD128-NEXT: i32.gt_s $push2=, $pop1, $pop0 ; NO-SIMD128-NEXT: i32.select $push3=, $8, $16, $pop2 -; NO-SIMD128-NEXT: i32.store16 0($pop5), $pop3 -; NO-SIMD128-NEXT: i32.const $push10=, 12 -; NO-SIMD128-NEXT: 
i32.add $push11=, $0, $pop10 -; NO-SIMD128-NEXT: i32.extend16_s $push7=, $7 -; NO-SIMD128-NEXT: i32.extend16_s $push6=, $15 -; NO-SIMD128-NEXT: i32.gt_s $push8=, $pop7, $pop6 -; NO-SIMD128-NEXT: i32.select $push9=, $7, $15, $pop8 -; NO-SIMD128-NEXT: i32.store16 0($pop11), $pop9 -; NO-SIMD128-NEXT: i32.const $push16=, 10 -; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.extend16_s $push13=, $6 -; NO-SIMD128-NEXT: i32.extend16_s $push12=, $14 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop3 +; NO-SIMD128-NEXT: i32.extend16_s $push5=, $7 +; NO-SIMD128-NEXT: i32.extend16_s $push4=, $15 +; NO-SIMD128-NEXT: i32.gt_s $push6=, $pop5, $pop4 +; NO-SIMD128-NEXT: i32.select $push7=, $7, $15, $pop6 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop7 +; NO-SIMD128-NEXT: i32.extend16_s $push9=, $6 +; NO-SIMD128-NEXT: i32.extend16_s $push8=, $14 +; NO-SIMD128-NEXT: i32.gt_s $push10=, $pop9, $pop8 +; NO-SIMD128-NEXT: i32.select $push11=, $6, $14, $pop10 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop11 +; NO-SIMD128-NEXT: i32.extend16_s $push13=, $5 +; NO-SIMD128-NEXT: i32.extend16_s $push12=, $13 ; NO-SIMD128-NEXT: i32.gt_s $push14=, $pop13, $pop12 -; NO-SIMD128-NEXT: i32.select $push15=, $6, $14, $pop14 -; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15 -; NO-SIMD128-NEXT: i32.extend16_s $push19=, $5 -; NO-SIMD128-NEXT: i32.extend16_s $push18=, $13 -; NO-SIMD128-NEXT: i32.gt_s $push20=, $pop19, $pop18 -; NO-SIMD128-NEXT: i32.select $push21=, $5, $13, $pop20 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop21 -; NO-SIMD128-NEXT: i32.const $push26=, 6 -; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-NEXT: i32.extend16_s $push23=, $4 -; NO-SIMD128-NEXT: i32.extend16_s $push22=, $12 -; NO-SIMD128-NEXT: i32.gt_s $push24=, $pop23, $pop22 -; NO-SIMD128-NEXT: i32.select $push25=, $4, $12, $pop24 -; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25 -; NO-SIMD128-NEXT: i32.extend16_s $push29=, $3 -; NO-SIMD128-NEXT: i32.extend16_s $push28=, $11 +; NO-SIMD128-NEXT: i32.select 
$push15=, $5, $13, $pop14 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop15 +; NO-SIMD128-NEXT: i32.extend16_s $push17=, $4 +; NO-SIMD128-NEXT: i32.extend16_s $push16=, $12 +; NO-SIMD128-NEXT: i32.gt_s $push18=, $pop17, $pop16 +; NO-SIMD128-NEXT: i32.select $push19=, $4, $12, $pop18 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop19 +; NO-SIMD128-NEXT: i32.extend16_s $push21=, $3 +; NO-SIMD128-NEXT: i32.extend16_s $push20=, $11 +; NO-SIMD128-NEXT: i32.gt_s $push22=, $pop21, $pop20 +; NO-SIMD128-NEXT: i32.select $push23=, $3, $11, $pop22 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop23 +; NO-SIMD128-NEXT: i32.extend16_s $push25=, $2 +; NO-SIMD128-NEXT: i32.extend16_s $push24=, $10 +; NO-SIMD128-NEXT: i32.gt_s $push26=, $pop25, $pop24 +; NO-SIMD128-NEXT: i32.select $push27=, $2, $10, $pop26 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop27 +; NO-SIMD128-NEXT: i32.extend16_s $push29=, $1 +; NO-SIMD128-NEXT: i32.extend16_s $push28=, $9 ; NO-SIMD128-NEXT: i32.gt_s $push30=, $pop29, $pop28 -; NO-SIMD128-NEXT: i32.select $push31=, $3, $11, $pop30 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop31 -; NO-SIMD128-NEXT: i32.extend16_s $push33=, $2 -; NO-SIMD128-NEXT: i32.extend16_s $push32=, $10 -; NO-SIMD128-NEXT: i32.gt_s $push34=, $pop33, $pop32 -; NO-SIMD128-NEXT: i32.select $push35=, $2, $10, $pop34 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop35 -; NO-SIMD128-NEXT: i32.extend16_s $push37=, $1 -; NO-SIMD128-NEXT: i32.extend16_s $push36=, $9 -; NO-SIMD128-NEXT: i32.gt_s $push38=, $pop37, $pop36 -; NO-SIMD128-NEXT: i32.select $push39=, $1, $9, $pop38 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop39 +; NO-SIMD128-NEXT: i32.select $push31=, $1, $9, $pop30 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop31 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: max_s_v8i16: @@ -6786,39 +5554,31 @@ define <8 x i16> @max_s_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-FAST-NEXT: i32.gt_s $push10=, $pop9, $pop8 ; NO-SIMD128-FAST-NEXT: i32.select $push11=, $3, $11, $pop10 ; NO-SIMD128-FAST-NEXT: i32.store16 
4($0), $pop11 -; NO-SIMD128-FAST-NEXT: i32.const $push16=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16 ; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $4 ; NO-SIMD128-FAST-NEXT: i32.extend16_s $push12=, $12 ; NO-SIMD128-FAST-NEXT: i32.gt_s $push14=, $pop13, $pop12 ; NO-SIMD128-FAST-NEXT: i32.select $push15=, $4, $12, $pop14 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop17), $pop15 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push19=, $5 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push18=, $13 -; NO-SIMD128-FAST-NEXT: i32.gt_s $push20=, $pop19, $pop18 -; NO-SIMD128-FAST-NEXT: i32.select $push21=, $5, $13, $pop20 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop21 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $6 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push22=, $14 -; NO-SIMD128-FAST-NEXT: i32.gt_s $push24=, $pop23, $pop22 -; NO-SIMD128-FAST-NEXT: i32.select $push25=, $6, $14, $pop24 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push32=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push29=, $7 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $15 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop15 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push17=, $5 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push16=, $13 +; NO-SIMD128-FAST-NEXT: i32.gt_s $push18=, $pop17, $pop16 +; NO-SIMD128-FAST-NEXT: i32.select $push19=, $5, $13, $pop18 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop19 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push21=, $6 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push20=, $14 +; NO-SIMD128-FAST-NEXT: i32.gt_s $push22=, $pop21, $pop20 +; NO-SIMD128-FAST-NEXT: i32.select $push23=, $6, $14, $pop22 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop23 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push25=, $7 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push24=, $15 +; 
NO-SIMD128-FAST-NEXT: i32.gt_s $push26=, $pop25, $pop24 +; NO-SIMD128-FAST-NEXT: i32.select $push27=, $7, $15, $pop26 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop27 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push29=, $8 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $16 ; NO-SIMD128-FAST-NEXT: i32.gt_s $push30=, $pop29, $pop28 -; NO-SIMD128-FAST-NEXT: i32.select $push31=, $7, $15, $pop30 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop33), $pop31 -; NO-SIMD128-FAST-NEXT: i32.const $push38=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push35=, $8 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push34=, $16 -; NO-SIMD128-FAST-NEXT: i32.gt_s $push36=, $pop35, $pop34 -; NO-SIMD128-FAST-NEXT: i32.select $push37=, $8, $16, $pop36 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop39), $pop37 +; NO-SIMD128-FAST-NEXT: i32.select $push31=, $8, $16, $pop30 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop31 ; NO-SIMD128-FAST-NEXT: return %c = icmp sgt <8 x i16> %x, %y %a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y @@ -6841,70 +5601,62 @@ define <8 x i16> @max_u_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-LABEL: max_u_v8i16: ; NO-SIMD128: .functype max_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push5=, 14 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 ; NO-SIMD128-NEXT: i32.const $push0=, 65535 ; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0 -; NO-SIMD128-NEXT: i32.const $push55=, 65535 -; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop55 +; NO-SIMD128-NEXT: i32.const $push47=, 65535 +; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop47 ; NO-SIMD128-NEXT: i32.gt_u $push3=, $pop2, $pop1 ; NO-SIMD128-NEXT: i32.select $push4=, $8, $16, $pop3 -; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4 -; NO-SIMD128-NEXT: i32.const $push11=, 12 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.const $push54=, 65535 -; 
NO-SIMD128-NEXT: i32.and $push8=, $7, $pop54 -; NO-SIMD128-NEXT: i32.const $push53=, 65535 -; NO-SIMD128-NEXT: i32.and $push7=, $15, $pop53 -; NO-SIMD128-NEXT: i32.gt_u $push9=, $pop8, $pop7 -; NO-SIMD128-NEXT: i32.select $push10=, $7, $15, $pop9 -; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push17=, 10 -; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-NEXT: i32.const $push52=, 65535 -; NO-SIMD128-NEXT: i32.and $push14=, $6, $pop52 -; NO-SIMD128-NEXT: i32.const $push51=, 65535 -; NO-SIMD128-NEXT: i32.and $push13=, $14, $pop51 -; NO-SIMD128-NEXT: i32.gt_u $push15=, $pop14, $pop13 -; NO-SIMD128-NEXT: i32.select $push16=, $6, $14, $pop15 -; NO-SIMD128-NEXT: i32.store16 0($pop18), $pop16 -; NO-SIMD128-NEXT: i32.const $push50=, 65535 -; NO-SIMD128-NEXT: i32.and $push20=, $5, $pop50 -; NO-SIMD128-NEXT: i32.const $push49=, 65535 -; NO-SIMD128-NEXT: i32.and $push19=, $13, $pop49 -; NO-SIMD128-NEXT: i32.gt_u $push21=, $pop20, $pop19 -; NO-SIMD128-NEXT: i32.select $push22=, $5, $13, $pop21 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop22 -; NO-SIMD128-NEXT: i32.const $push27=, 6 -; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-NEXT: i32.const $push48=, 65535 -; NO-SIMD128-NEXT: i32.and $push24=, $4, $pop48 -; NO-SIMD128-NEXT: i32.const $push47=, 65535 -; NO-SIMD128-NEXT: i32.and $push23=, $12, $pop47 -; NO-SIMD128-NEXT: i32.gt_u $push25=, $pop24, $pop23 -; NO-SIMD128-NEXT: i32.select $push26=, $4, $12, $pop25 -; NO-SIMD128-NEXT: i32.store16 0($pop28), $pop26 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop4 ; NO-SIMD128-NEXT: i32.const $push46=, 65535 -; NO-SIMD128-NEXT: i32.and $push30=, $3, $pop46 +; NO-SIMD128-NEXT: i32.and $push6=, $7, $pop46 ; NO-SIMD128-NEXT: i32.const $push45=, 65535 -; NO-SIMD128-NEXT: i32.and $push29=, $11, $pop45 -; NO-SIMD128-NEXT: i32.gt_u $push31=, $pop30, $pop29 -; NO-SIMD128-NEXT: i32.select $push32=, $3, $11, $pop31 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop32 +; NO-SIMD128-NEXT: i32.and 
$push5=, $15, $pop45 +; NO-SIMD128-NEXT: i32.gt_u $push7=, $pop6, $pop5 +; NO-SIMD128-NEXT: i32.select $push8=, $7, $15, $pop7 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop8 ; NO-SIMD128-NEXT: i32.const $push44=, 65535 -; NO-SIMD128-NEXT: i32.and $push34=, $2, $pop44 +; NO-SIMD128-NEXT: i32.and $push10=, $6, $pop44 ; NO-SIMD128-NEXT: i32.const $push43=, 65535 -; NO-SIMD128-NEXT: i32.and $push33=, $10, $pop43 -; NO-SIMD128-NEXT: i32.gt_u $push35=, $pop34, $pop33 -; NO-SIMD128-NEXT: i32.select $push36=, $2, $10, $pop35 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop36 +; NO-SIMD128-NEXT: i32.and $push9=, $14, $pop43 +; NO-SIMD128-NEXT: i32.gt_u $push11=, $pop10, $pop9 +; NO-SIMD128-NEXT: i32.select $push12=, $6, $14, $pop11 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop12 ; NO-SIMD128-NEXT: i32.const $push42=, 65535 -; NO-SIMD128-NEXT: i32.and $push38=, $1, $pop42 +; NO-SIMD128-NEXT: i32.and $push14=, $5, $pop42 ; NO-SIMD128-NEXT: i32.const $push41=, 65535 -; NO-SIMD128-NEXT: i32.and $push37=, $9, $pop41 -; NO-SIMD128-NEXT: i32.gt_u $push39=, $pop38, $pop37 -; NO-SIMD128-NEXT: i32.select $push40=, $1, $9, $pop39 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop40 +; NO-SIMD128-NEXT: i32.and $push13=, $13, $pop41 +; NO-SIMD128-NEXT: i32.gt_u $push15=, $pop14, $pop13 +; NO-SIMD128-NEXT: i32.select $push16=, $5, $13, $pop15 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop16 +; NO-SIMD128-NEXT: i32.const $push40=, 65535 +; NO-SIMD128-NEXT: i32.and $push18=, $4, $pop40 +; NO-SIMD128-NEXT: i32.const $push39=, 65535 +; NO-SIMD128-NEXT: i32.and $push17=, $12, $pop39 +; NO-SIMD128-NEXT: i32.gt_u $push19=, $pop18, $pop17 +; NO-SIMD128-NEXT: i32.select $push20=, $4, $12, $pop19 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop20 +; NO-SIMD128-NEXT: i32.const $push38=, 65535 +; NO-SIMD128-NEXT: i32.and $push22=, $3, $pop38 +; NO-SIMD128-NEXT: i32.const $push37=, 65535 +; NO-SIMD128-NEXT: i32.and $push21=, $11, $pop37 +; NO-SIMD128-NEXT: i32.gt_u $push23=, $pop22, $pop21 +; NO-SIMD128-NEXT: i32.select 
$push24=, $3, $11, $pop23 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop24 +; NO-SIMD128-NEXT: i32.const $push36=, 65535 +; NO-SIMD128-NEXT: i32.and $push26=, $2, $pop36 +; NO-SIMD128-NEXT: i32.const $push35=, 65535 +; NO-SIMD128-NEXT: i32.and $push25=, $10, $pop35 +; NO-SIMD128-NEXT: i32.gt_u $push27=, $pop26, $pop25 +; NO-SIMD128-NEXT: i32.select $push28=, $2, $10, $pop27 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop28 +; NO-SIMD128-NEXT: i32.const $push34=, 65535 +; NO-SIMD128-NEXT: i32.and $push30=, $1, $pop34 +; NO-SIMD128-NEXT: i32.const $push33=, 65535 +; NO-SIMD128-NEXT: i32.and $push29=, $9, $pop33 +; NO-SIMD128-NEXT: i32.gt_u $push31=, $pop30, $pop29 +; NO-SIMD128-NEXT: i32.select $push32=, $1, $9, $pop31 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop32 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: max_u_v8i16: @@ -6912,68 +5664,60 @@ define <8 x i16> @max_u_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535 ; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0 -; NO-SIMD128-FAST-NEXT: i32.const $push55=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop55 +; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop47 ; NO-SIMD128-FAST-NEXT: i32.gt_u $push3=, $pop2, $pop1 ; NO-SIMD128-FAST-NEXT: i32.select $push4=, $1, $9, $pop3 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push54=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop54 -; NO-SIMD128-FAST-NEXT: i32.const $push53=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop53 +; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop46 +; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop45 ; NO-SIMD128-FAST-NEXT: i32.gt_u $push7=, $pop6, $pop5 ; NO-SIMD128-FAST-NEXT: i32.select $push8=, $2, $10, $pop7 ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8 
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop52 -; NO-SIMD128-FAST-NEXT: i32.const $push51=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $pop51 +; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop44 +; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $pop43 ; NO-SIMD128-FAST-NEXT: i32.gt_u $push11=, $pop10, $pop9 ; NO-SIMD128-FAST-NEXT: i32.select $push12=, $3, $11, $pop11 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push50=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop50 -; NO-SIMD128-FAST-NEXT: i32.const $push49=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $pop49 +; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop42 +; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $pop41 ; NO-SIMD128-FAST-NEXT: i32.gt_u $push15=, $pop14, $pop13 ; NO-SIMD128-FAST-NEXT: i32.select $push16=, $4, $12, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push48=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push20=, $5, $pop48 -; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push19=, $13, $pop47 -; NO-SIMD128-FAST-NEXT: i32.gt_u $push21=, $pop20, $pop19 -; NO-SIMD128-FAST-NEXT: i32.select $push22=, $5, $13, $pop21 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push24=, $6, $pop46 -; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push23=, $14, $pop45 -; 
NO-SIMD128-FAST-NEXT: i32.gt_u $push25=, $pop24, $pop23 -; NO-SIMD128-FAST-NEXT: i32.select $push26=, $6, $14, $pop25 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push33=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push30=, $7, $pop44 -; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push29=, $15, $pop43 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.const $push40=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push18=, $5, $pop40 +; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $13, $pop39 +; NO-SIMD128-FAST-NEXT: i32.gt_u $push19=, $pop18, $pop17 +; NO-SIMD128-FAST-NEXT: i32.select $push20=, $5, $13, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $6, $pop38 +; NO-SIMD128-FAST-NEXT: i32.const $push37=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push21=, $14, $pop37 +; NO-SIMD128-FAST-NEXT: i32.gt_u $push23=, $pop22, $pop21 +; NO-SIMD128-FAST-NEXT: i32.select $push24=, $6, $14, $pop23 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.const $push36=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push26=, $7, $pop36 +; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $pop35 +; NO-SIMD128-FAST-NEXT: i32.gt_u $push27=, $pop26, $pop25 +; NO-SIMD128-FAST-NEXT: i32.select $push28=, $7, $15, $pop27 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28 +; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push30=, $8, $pop34 +; NO-SIMD128-FAST-NEXT: i32.const $push33=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $pop33 ; NO-SIMD128-FAST-NEXT: i32.gt_u $push31=, $pop30, $pop29 -; NO-SIMD128-FAST-NEXT: i32.select 
$push32=, $7, $15, $pop31 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop34), $pop32 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39 -; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push36=, $8, $pop42 -; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push35=, $16, $pop41 -; NO-SIMD128-FAST-NEXT: i32.gt_u $push37=, $pop36, $pop35 -; NO-SIMD128-FAST-NEXT: i32.select $push38=, $8, $16, $pop37 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop40), $pop38 +; NO-SIMD128-FAST-NEXT: i32.select $push32=, $8, $16, $pop31 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32 ; NO-SIMD128-FAST-NEXT: return %c = icmp ugt <8 x i16> %x, %y %a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y @@ -6996,78 +5740,70 @@ define <8 x i16> @avgr_u_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-LABEL: avgr_u_v8i16: ; NO-SIMD128: .functype avgr_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push0=, 14 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.add $push2=, $8, $16 -; NO-SIMD128-NEXT: i32.const $push3=, 1 -; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3 -; NO-SIMD128-NEXT: i32.const $push5=, 65534 -; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5 -; NO-SIMD128-NEXT: i32.const $push63=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop63 -; NO-SIMD128-NEXT: i32.store16 0($pop1), $pop7 -; NO-SIMD128-NEXT: i32.const $push8=, 12 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.add $push10=, $7, $15 -; NO-SIMD128-NEXT: i32.const $push62=, 1 -; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop62 -; NO-SIMD128-NEXT: i32.const $push61=, 65534 -; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop61 -; NO-SIMD128-NEXT: i32.const $push60=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop60 -; NO-SIMD128-NEXT: i32.store16 
0($pop9), $pop13 -; NO-SIMD128-NEXT: i32.const $push14=, 10 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.add $push16=, $6, $14 -; NO-SIMD128-NEXT: i32.const $push59=, 1 -; NO-SIMD128-NEXT: i32.add $push17=, $pop16, $pop59 -; NO-SIMD128-NEXT: i32.const $push58=, 65534 -; NO-SIMD128-NEXT: i32.and $push18=, $pop17, $pop58 -; NO-SIMD128-NEXT: i32.const $push57=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push19=, $pop18, $pop57 -; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop19 -; NO-SIMD128-NEXT: i32.add $push20=, $5, $13 -; NO-SIMD128-NEXT: i32.const $push56=, 1 -; NO-SIMD128-NEXT: i32.add $push21=, $pop20, $pop56 -; NO-SIMD128-NEXT: i32.const $push55=, 65534 -; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $pop55 +; NO-SIMD128-NEXT: i32.add $push0=, $8, $16 +; NO-SIMD128-NEXT: i32.const $push1=, 1 +; NO-SIMD128-NEXT: i32.add $push2=, $pop0, $pop1 +; NO-SIMD128-NEXT: i32.const $push3=, 65534 +; NO-SIMD128-NEXT: i32.and $push4=, $pop2, $pop3 +; NO-SIMD128-NEXT: i32.const $push55=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push5=, $pop4, $pop55 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop5 +; NO-SIMD128-NEXT: i32.add $push6=, $7, $15 ; NO-SIMD128-NEXT: i32.const $push54=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push23=, $pop22, $pop54 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop23 -; NO-SIMD128-NEXT: i32.const $push24=, 6 -; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-NEXT: i32.add $push26=, $4, $12 -; NO-SIMD128-NEXT: i32.const $push53=, 1 -; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop53 -; NO-SIMD128-NEXT: i32.const $push52=, 65534 -; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop52 +; NO-SIMD128-NEXT: i32.add $push7=, $pop6, $pop54 +; NO-SIMD128-NEXT: i32.const $push53=, 65534 +; NO-SIMD128-NEXT: i32.and $push8=, $pop7, $pop53 +; NO-SIMD128-NEXT: i32.const $push52=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop52 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop9 +; NO-SIMD128-NEXT: i32.add $push10=, $6, $14 ; NO-SIMD128-NEXT: i32.const $push51=, 
1 -; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop51 -; NO-SIMD128-NEXT: i32.store16 0($pop25), $pop29 -; NO-SIMD128-NEXT: i32.add $push30=, $3, $11 -; NO-SIMD128-NEXT: i32.const $push50=, 1 -; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop50 -; NO-SIMD128-NEXT: i32.const $push49=, 65534 -; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop49 +; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop51 +; NO-SIMD128-NEXT: i32.const $push50=, 65534 +; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop50 +; NO-SIMD128-NEXT: i32.const $push49=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop49 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop13 +; NO-SIMD128-NEXT: i32.add $push14=, $5, $13 ; NO-SIMD128-NEXT: i32.const $push48=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop48 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop33 -; NO-SIMD128-NEXT: i32.add $push34=, $2, $10 -; NO-SIMD128-NEXT: i32.const $push47=, 1 -; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop47 -; NO-SIMD128-NEXT: i32.const $push46=, 65534 -; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop46 +; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop48 +; NO-SIMD128-NEXT: i32.const $push47=, 65534 +; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $pop47 +; NO-SIMD128-NEXT: i32.const $push46=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push17=, $pop16, $pop46 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop17 +; NO-SIMD128-NEXT: i32.add $push18=, $4, $12 ; NO-SIMD128-NEXT: i32.const $push45=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop45 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop37 -; NO-SIMD128-NEXT: i32.add $push38=, $1, $9 -; NO-SIMD128-NEXT: i32.const $push44=, 1 -; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop44 -; NO-SIMD128-NEXT: i32.const $push43=, 65534 -; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $pop43 +; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop45 +; NO-SIMD128-NEXT: i32.const $push44=, 65534 +; NO-SIMD128-NEXT: i32.and $push20=, $pop19, $pop44 +; NO-SIMD128-NEXT: i32.const $push43=, 1 +; 
NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop43 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop21 +; NO-SIMD128-NEXT: i32.add $push22=, $3, $11 ; NO-SIMD128-NEXT: i32.const $push42=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop42 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop41 +; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop42 +; NO-SIMD128-NEXT: i32.const $push41=, 65534 +; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop41 +; NO-SIMD128-NEXT: i32.const $push40=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop40 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop25 +; NO-SIMD128-NEXT: i32.add $push26=, $2, $10 +; NO-SIMD128-NEXT: i32.const $push39=, 1 +; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop39 +; NO-SIMD128-NEXT: i32.const $push38=, 65534 +; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop38 +; NO-SIMD128-NEXT: i32.const $push37=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop37 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop29 +; NO-SIMD128-NEXT: i32.add $push30=, $1, $9 +; NO-SIMD128-NEXT: i32.const $push36=, 1 +; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop36 +; NO-SIMD128-NEXT: i32.const $push35=, 65534 +; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop35 +; NO-SIMD128-NEXT: i32.const $push34=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop34 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop33 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: avgr_u_v8i16: @@ -7078,73 +5814,65 @@ define <8 x i16> @avgr_u_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-FAST-NEXT: i32.add $push2=, $pop0, $pop1 ; NO-SIMD128-FAST-NEXT: i32.const $push3=, 65534 ; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop2, $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push63=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop63 +; NO-SIMD128-FAST-NEXT: i32.const $push55=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop55 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop5 ; NO-SIMD128-FAST-NEXT: i32.add $push6=, $2, $10 -; NO-SIMD128-FAST-NEXT: i32.const 
$push62=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop62 -; NO-SIMD128-FAST-NEXT: i32.const $push61=, 65534 -; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop61 -; NO-SIMD128-FAST-NEXT: i32.const $push60=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop60 +; NO-SIMD128-FAST-NEXT: i32.const $push54=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop54 +; NO-SIMD128-FAST-NEXT: i32.const $push53=, 65534 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop53 +; NO-SIMD128-FAST-NEXT: i32.const $push52=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop52 ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop9 ; NO-SIMD128-FAST-NEXT: i32.add $push10=, $3, $11 -; NO-SIMD128-FAST-NEXT: i32.const $push59=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop59 -; NO-SIMD128-FAST-NEXT: i32.const $push58=, 65534 -; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop58 -; NO-SIMD128-FAST-NEXT: i32.const $push57=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop57 -; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push14=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-FAST-NEXT: i32.add $push16=, $4, $12 -; NO-SIMD128-FAST-NEXT: i32.const $push56=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push17=, $pop16, $pop56 -; NO-SIMD128-FAST-NEXT: i32.const $push55=, 65534 -; NO-SIMD128-FAST-NEXT: i32.and $push18=, $pop17, $pop55 -; NO-SIMD128-FAST-NEXT: i32.const $push54=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push19=, $pop18, $pop54 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop19 -; NO-SIMD128-FAST-NEXT: i32.add $push20=, $5, $13 -; NO-SIMD128-FAST-NEXT: i32.const $push53=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push21=, $pop20, $pop53 -; NO-SIMD128-FAST-NEXT: i32.const $push52=, 65534 -; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $pop52 ; NO-SIMD128-FAST-NEXT: i32.const $push51=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push23=, $pop22, $pop51 -; NO-SIMD128-FAST-NEXT: 
i32.store16 8($0), $pop23 -; NO-SIMD128-FAST-NEXT: i32.const $push24=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-FAST-NEXT: i32.add $push26=, $6, $14 -; NO-SIMD128-FAST-NEXT: i32.const $push50=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop50 -; NO-SIMD128-FAST-NEXT: i32.const $push49=, 65534 -; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop49 +; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop51 +; NO-SIMD128-FAST-NEXT: i32.const $push50=, 65534 +; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop50 +; NO-SIMD128-FAST-NEXT: i32.const $push49=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop49 +; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop13 +; NO-SIMD128-FAST-NEXT: i32.add $push14=, $4, $12 ; NO-SIMD128-FAST-NEXT: i32.const $push48=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop48 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop25), $pop29 -; NO-SIMD128-FAST-NEXT: i32.const $push30=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30 -; NO-SIMD128-FAST-NEXT: i32.add $push32=, $7, $15 -; NO-SIMD128-FAST-NEXT: i32.const $push47=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push33=, $pop32, $pop47 -; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65534 -; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $pop46 +; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop48 +; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65534 +; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $pop47 +; NO-SIMD128-FAST-NEXT: i32.const $push46=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop46 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop17 +; NO-SIMD128-FAST-NEXT: i32.add $push18=, $5, $13 ; NO-SIMD128-FAST-NEXT: i32.const $push45=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop45 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop31), $pop35 -; NO-SIMD128-FAST-NEXT: i32.const $push36=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36 -; NO-SIMD128-FAST-NEXT: i32.add $push38=, $8, $16 -; 
NO-SIMD128-FAST-NEXT: i32.const $push44=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop44 -; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65534 -; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop43 +; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop45 +; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65534 +; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $pop44 +; NO-SIMD128-FAST-NEXT: i32.const $push43=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop43 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop21 +; NO-SIMD128-FAST-NEXT: i32.add $push22=, $6, $14 ; NO-SIMD128-FAST-NEXT: i32.const $push42=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop42 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop37), $pop41 +; NO-SIMD128-FAST-NEXT: i32.add $push23=, $pop22, $pop42 +; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65534 +; NO-SIMD128-FAST-NEXT: i32.and $push24=, $pop23, $pop41 +; NO-SIMD128-FAST-NEXT: i32.const $push40=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop40 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop25 +; NO-SIMD128-FAST-NEXT: i32.add $push26=, $7, $15 +; NO-SIMD128-FAST-NEXT: i32.const $push39=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop39 +; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65534 +; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop38 +; NO-SIMD128-FAST-NEXT: i32.const $push37=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop37 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop29 +; NO-SIMD128-FAST-NEXT: i32.add $push30=, $8, $16 +; NO-SIMD128-FAST-NEXT: i32.const $push36=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push31=, $pop30, $pop36 +; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65534 +; NO-SIMD128-FAST-NEXT: i32.and $push32=, $pop31, $pop35 +; NO-SIMD128-FAST-NEXT: i32.const $push34=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop34 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop33 ; NO-SIMD128-FAST-NEXT: return %a = add nuw <8 x i16> %x, %y %b = add nuw <8 
x i16> %a, @@ -7176,78 +5904,70 @@ define <8 x i16> @avgr_u_v8i16_wrap(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-LABEL: avgr_u_v8i16_wrap: ; NO-SIMD128: .functype avgr_u_v8i16_wrap (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push0=, 14 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.add $push2=, $8, $16 -; NO-SIMD128-NEXT: i32.const $push3=, 1 -; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3 -; NO-SIMD128-NEXT: i32.const $push5=, 65534 -; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5 -; NO-SIMD128-NEXT: i32.const $push63=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop63 -; NO-SIMD128-NEXT: i32.store16 0($pop1), $pop7 -; NO-SIMD128-NEXT: i32.const $push8=, 12 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.add $push10=, $7, $15 -; NO-SIMD128-NEXT: i32.const $push62=, 1 -; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop62 -; NO-SIMD128-NEXT: i32.const $push61=, 65534 -; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop61 -; NO-SIMD128-NEXT: i32.const $push60=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop60 -; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop13 -; NO-SIMD128-NEXT: i32.const $push14=, 10 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.add $push16=, $6, $14 -; NO-SIMD128-NEXT: i32.const $push59=, 1 -; NO-SIMD128-NEXT: i32.add $push17=, $pop16, $pop59 -; NO-SIMD128-NEXT: i32.const $push58=, 65534 -; NO-SIMD128-NEXT: i32.and $push18=, $pop17, $pop58 -; NO-SIMD128-NEXT: i32.const $push57=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push19=, $pop18, $pop57 -; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop19 -; NO-SIMD128-NEXT: i32.add $push20=, $5, $13 -; NO-SIMD128-NEXT: i32.const $push56=, 1 -; NO-SIMD128-NEXT: i32.add $push21=, $pop20, $pop56 -; NO-SIMD128-NEXT: i32.const $push55=, 65534 -; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $pop55 +; NO-SIMD128-NEXT: i32.add $push0=, $8, 
$16 +; NO-SIMD128-NEXT: i32.const $push1=, 1 +; NO-SIMD128-NEXT: i32.add $push2=, $pop0, $pop1 +; NO-SIMD128-NEXT: i32.const $push3=, 65534 +; NO-SIMD128-NEXT: i32.and $push4=, $pop2, $pop3 +; NO-SIMD128-NEXT: i32.const $push55=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push5=, $pop4, $pop55 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop5 +; NO-SIMD128-NEXT: i32.add $push6=, $7, $15 ; NO-SIMD128-NEXT: i32.const $push54=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push23=, $pop22, $pop54 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop23 -; NO-SIMD128-NEXT: i32.const $push24=, 6 -; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-NEXT: i32.add $push26=, $4, $12 -; NO-SIMD128-NEXT: i32.const $push53=, 1 -; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop53 -; NO-SIMD128-NEXT: i32.const $push52=, 65534 -; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop52 +; NO-SIMD128-NEXT: i32.add $push7=, $pop6, $pop54 +; NO-SIMD128-NEXT: i32.const $push53=, 65534 +; NO-SIMD128-NEXT: i32.and $push8=, $pop7, $pop53 +; NO-SIMD128-NEXT: i32.const $push52=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop52 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop9 +; NO-SIMD128-NEXT: i32.add $push10=, $6, $14 ; NO-SIMD128-NEXT: i32.const $push51=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop51 -; NO-SIMD128-NEXT: i32.store16 0($pop25), $pop29 -; NO-SIMD128-NEXT: i32.add $push30=, $3, $11 -; NO-SIMD128-NEXT: i32.const $push50=, 1 -; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop50 -; NO-SIMD128-NEXT: i32.const $push49=, 65534 -; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop49 +; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop51 +; NO-SIMD128-NEXT: i32.const $push50=, 65534 +; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop50 +; NO-SIMD128-NEXT: i32.const $push49=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop49 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop13 +; NO-SIMD128-NEXT: i32.add $push14=, $5, $13 ; NO-SIMD128-NEXT: i32.const $push48=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push33=, 
$pop32, $pop48 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop33 -; NO-SIMD128-NEXT: i32.add $push34=, $2, $10 -; NO-SIMD128-NEXT: i32.const $push47=, 1 -; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop47 -; NO-SIMD128-NEXT: i32.const $push46=, 65534 -; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop46 +; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop48 +; NO-SIMD128-NEXT: i32.const $push47=, 65534 +; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $pop47 +; NO-SIMD128-NEXT: i32.const $push46=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push17=, $pop16, $pop46 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop17 +; NO-SIMD128-NEXT: i32.add $push18=, $4, $12 ; NO-SIMD128-NEXT: i32.const $push45=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop45 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop37 -; NO-SIMD128-NEXT: i32.add $push38=, $1, $9 -; NO-SIMD128-NEXT: i32.const $push44=, 1 -; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop44 -; NO-SIMD128-NEXT: i32.const $push43=, 65534 -; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $pop43 +; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop45 +; NO-SIMD128-NEXT: i32.const $push44=, 65534 +; NO-SIMD128-NEXT: i32.and $push20=, $pop19, $pop44 +; NO-SIMD128-NEXT: i32.const $push43=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop43 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop21 +; NO-SIMD128-NEXT: i32.add $push22=, $3, $11 ; NO-SIMD128-NEXT: i32.const $push42=, 1 -; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop42 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop41 +; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop42 +; NO-SIMD128-NEXT: i32.const $push41=, 65534 +; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop41 +; NO-SIMD128-NEXT: i32.const $push40=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop40 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop25 +; NO-SIMD128-NEXT: i32.add $push26=, $2, $10 +; NO-SIMD128-NEXT: i32.const $push39=, 1 +; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop39 +; NO-SIMD128-NEXT: i32.const $push38=, 65534 +; 
NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop38 +; NO-SIMD128-NEXT: i32.const $push37=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop37 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop29 +; NO-SIMD128-NEXT: i32.add $push30=, $1, $9 +; NO-SIMD128-NEXT: i32.const $push36=, 1 +; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop36 +; NO-SIMD128-NEXT: i32.const $push35=, 65534 +; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop35 +; NO-SIMD128-NEXT: i32.const $push34=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop34 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop33 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: avgr_u_v8i16_wrap: @@ -7258,73 +5978,65 @@ define <8 x i16> @avgr_u_v8i16_wrap(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-FAST-NEXT: i32.add $push2=, $pop0, $pop1 ; NO-SIMD128-FAST-NEXT: i32.const $push3=, 65534 ; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop2, $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push63=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop63 +; NO-SIMD128-FAST-NEXT: i32.const $push55=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop55 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop5 ; NO-SIMD128-FAST-NEXT: i32.add $push6=, $2, $10 -; NO-SIMD128-FAST-NEXT: i32.const $push62=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop62 -; NO-SIMD128-FAST-NEXT: i32.const $push61=, 65534 -; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop61 -; NO-SIMD128-FAST-NEXT: i32.const $push60=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop60 +; NO-SIMD128-FAST-NEXT: i32.const $push54=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop54 +; NO-SIMD128-FAST-NEXT: i32.const $push53=, 65534 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop53 +; NO-SIMD128-FAST-NEXT: i32.const $push52=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop52 ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop9 ; NO-SIMD128-FAST-NEXT: i32.add $push10=, $3, $11 -; NO-SIMD128-FAST-NEXT: i32.const $push59=, 1 -; 
NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop59 -; NO-SIMD128-FAST-NEXT: i32.const $push58=, 65534 -; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop58 -; NO-SIMD128-FAST-NEXT: i32.const $push57=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop57 -; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push14=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-FAST-NEXT: i32.add $push16=, $4, $12 -; NO-SIMD128-FAST-NEXT: i32.const $push56=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push17=, $pop16, $pop56 -; NO-SIMD128-FAST-NEXT: i32.const $push55=, 65534 -; NO-SIMD128-FAST-NEXT: i32.and $push18=, $pop17, $pop55 -; NO-SIMD128-FAST-NEXT: i32.const $push54=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push19=, $pop18, $pop54 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop19 -; NO-SIMD128-FAST-NEXT: i32.add $push20=, $5, $13 -; NO-SIMD128-FAST-NEXT: i32.const $push53=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push21=, $pop20, $pop53 -; NO-SIMD128-FAST-NEXT: i32.const $push52=, 65534 -; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $pop52 ; NO-SIMD128-FAST-NEXT: i32.const $push51=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push23=, $pop22, $pop51 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop23 -; NO-SIMD128-FAST-NEXT: i32.const $push24=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24 -; NO-SIMD128-FAST-NEXT: i32.add $push26=, $6, $14 -; NO-SIMD128-FAST-NEXT: i32.const $push50=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop50 -; NO-SIMD128-FAST-NEXT: i32.const $push49=, 65534 -; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop49 +; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop51 +; NO-SIMD128-FAST-NEXT: i32.const $push50=, 65534 +; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop50 +; NO-SIMD128-FAST-NEXT: i32.const $push49=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop49 +; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop13 +; NO-SIMD128-FAST-NEXT: i32.add 
$push14=, $4, $12 ; NO-SIMD128-FAST-NEXT: i32.const $push48=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop48 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop25), $pop29 -; NO-SIMD128-FAST-NEXT: i32.const $push30=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30 -; NO-SIMD128-FAST-NEXT: i32.add $push32=, $7, $15 -; NO-SIMD128-FAST-NEXT: i32.const $push47=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push33=, $pop32, $pop47 -; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65534 -; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $pop46 +; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop48 +; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65534 +; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $pop47 +; NO-SIMD128-FAST-NEXT: i32.const $push46=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop46 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop17 +; NO-SIMD128-FAST-NEXT: i32.add $push18=, $5, $13 ; NO-SIMD128-FAST-NEXT: i32.const $push45=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop45 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop31), $pop35 -; NO-SIMD128-FAST-NEXT: i32.const $push36=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36 -; NO-SIMD128-FAST-NEXT: i32.add $push38=, $8, $16 -; NO-SIMD128-FAST-NEXT: i32.const $push44=, 1 -; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop44 -; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65534 -; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop43 +; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop45 +; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65534 +; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $pop44 +; NO-SIMD128-FAST-NEXT: i32.const $push43=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop43 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop21 +; NO-SIMD128-FAST-NEXT: i32.add $push22=, $6, $14 ; NO-SIMD128-FAST-NEXT: i32.const $push42=, 1 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop42 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop37), $pop41 +; 
NO-SIMD128-FAST-NEXT: i32.add $push23=, $pop22, $pop42 +; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65534 +; NO-SIMD128-FAST-NEXT: i32.and $push24=, $pop23, $pop41 +; NO-SIMD128-FAST-NEXT: i32.const $push40=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop40 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop25 +; NO-SIMD128-FAST-NEXT: i32.add $push26=, $7, $15 +; NO-SIMD128-FAST-NEXT: i32.const $push39=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop39 +; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65534 +; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop38 +; NO-SIMD128-FAST-NEXT: i32.const $push37=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop37 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop29 +; NO-SIMD128-FAST-NEXT: i32.add $push30=, $8, $16 +; NO-SIMD128-FAST-NEXT: i32.const $push36=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push31=, $pop30, $pop36 +; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65534 +; NO-SIMD128-FAST-NEXT: i32.and $push32=, $pop31, $pop35 +; NO-SIMD128-FAST-NEXT: i32.const $push34=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop34 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop33 ; NO-SIMD128-FAST-NEXT: return %a = add <8 x i16> %x, %y %b = add <8 x i16> %a, @@ -7348,70 +6060,62 @@ define <8 x i16> @abs_v8i16(<8 x i16> %x) { ; NO-SIMD128-LABEL: abs_v8i16: ; NO-SIMD128: .functype abs_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push4=, 14 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 ; NO-SIMD128-NEXT: i32.extend16_s $push0=, $8 ; NO-SIMD128-NEXT: i32.const $push1=, 15 -; NO-SIMD128-NEXT: i32.shr_s $push55=, $pop0, $pop1 -; NO-SIMD128-NEXT: local.tee $push54=, $9=, $pop55 -; NO-SIMD128-NEXT: i32.xor $push2=, $8, $pop54 +; NO-SIMD128-NEXT: i32.shr_s $push47=, $pop0, $pop1 +; NO-SIMD128-NEXT: local.tee $push46=, $9=, $pop47 +; NO-SIMD128-NEXT: i32.xor $push2=, $8, $pop46 ; NO-SIMD128-NEXT: i32.sub $push3=, $pop2, $9 -; 
NO-SIMD128-NEXT: i32.store16 0($pop5), $pop3 -; NO-SIMD128-NEXT: i32.const $push9=, 12 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-NEXT: i32.extend16_s $push6=, $7 -; NO-SIMD128-NEXT: i32.const $push53=, 15 -; NO-SIMD128-NEXT: i32.shr_s $push52=, $pop6, $pop53 -; NO-SIMD128-NEXT: local.tee $push51=, $8=, $pop52 -; NO-SIMD128-NEXT: i32.xor $push7=, $7, $pop51 -; NO-SIMD128-NEXT: i32.sub $push8=, $pop7, $8 -; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8 -; NO-SIMD128-NEXT: i32.const $push14=, 10 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.extend16_s $push11=, $6 -; NO-SIMD128-NEXT: i32.const $push50=, 15 -; NO-SIMD128-NEXT: i32.shr_s $push49=, $pop11, $pop50 -; NO-SIMD128-NEXT: local.tee $push48=, $8=, $pop49 -; NO-SIMD128-NEXT: i32.xor $push12=, $6, $pop48 -; NO-SIMD128-NEXT: i32.sub $push13=, $pop12, $8 -; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13 -; NO-SIMD128-NEXT: i32.extend16_s $push16=, $5 -; NO-SIMD128-NEXT: i32.const $push47=, 15 -; NO-SIMD128-NEXT: i32.shr_s $push46=, $pop16, $pop47 -; NO-SIMD128-NEXT: local.tee $push45=, $8=, $pop46 -; NO-SIMD128-NEXT: i32.xor $push17=, $5, $pop45 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop3 +; NO-SIMD128-NEXT: i32.extend16_s $push4=, $7 +; NO-SIMD128-NEXT: i32.const $push45=, 15 +; NO-SIMD128-NEXT: i32.shr_s $push44=, $pop4, $pop45 +; NO-SIMD128-NEXT: local.tee $push43=, $8=, $pop44 +; NO-SIMD128-NEXT: i32.xor $push5=, $7, $pop43 +; NO-SIMD128-NEXT: i32.sub $push6=, $pop5, $8 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop6 +; NO-SIMD128-NEXT: i32.extend16_s $push7=, $6 +; NO-SIMD128-NEXT: i32.const $push42=, 15 +; NO-SIMD128-NEXT: i32.shr_s $push41=, $pop7, $pop42 +; NO-SIMD128-NEXT: local.tee $push40=, $8=, $pop41 +; NO-SIMD128-NEXT: i32.xor $push8=, $6, $pop40 +; NO-SIMD128-NEXT: i32.sub $push9=, $pop8, $8 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop9 +; NO-SIMD128-NEXT: i32.extend16_s $push10=, $5 +; NO-SIMD128-NEXT: i32.const $push39=, 15 +; NO-SIMD128-NEXT: 
i32.shr_s $push38=, $pop10, $pop39 +; NO-SIMD128-NEXT: local.tee $push37=, $8=, $pop38 +; NO-SIMD128-NEXT: i32.xor $push11=, $5, $pop37 +; NO-SIMD128-NEXT: i32.sub $push12=, $pop11, $8 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop12 +; NO-SIMD128-NEXT: i32.extend16_s $push13=, $4 +; NO-SIMD128-NEXT: i32.const $push36=, 15 +; NO-SIMD128-NEXT: i32.shr_s $push35=, $pop13, $pop36 +; NO-SIMD128-NEXT: local.tee $push34=, $8=, $pop35 +; NO-SIMD128-NEXT: i32.xor $push14=, $4, $pop34 +; NO-SIMD128-NEXT: i32.sub $push15=, $pop14, $8 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop15 +; NO-SIMD128-NEXT: i32.extend16_s $push16=, $3 +; NO-SIMD128-NEXT: i32.const $push33=, 15 +; NO-SIMD128-NEXT: i32.shr_s $push32=, $pop16, $pop33 +; NO-SIMD128-NEXT: local.tee $push31=, $8=, $pop32 +; NO-SIMD128-NEXT: i32.xor $push17=, $3, $pop31 ; NO-SIMD128-NEXT: i32.sub $push18=, $pop17, $8 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop18 -; NO-SIMD128-NEXT: i32.const $push22=, 6 -; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22 -; NO-SIMD128-NEXT: i32.extend16_s $push19=, $4 -; NO-SIMD128-NEXT: i32.const $push44=, 15 -; NO-SIMD128-NEXT: i32.shr_s $push43=, $pop19, $pop44 -; NO-SIMD128-NEXT: local.tee $push42=, $8=, $pop43 -; NO-SIMD128-NEXT: i32.xor $push20=, $4, $pop42 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop18 +; NO-SIMD128-NEXT: i32.extend16_s $push19=, $2 +; NO-SIMD128-NEXT: i32.const $push30=, 15 +; NO-SIMD128-NEXT: i32.shr_s $push29=, $pop19, $pop30 +; NO-SIMD128-NEXT: local.tee $push28=, $8=, $pop29 +; NO-SIMD128-NEXT: i32.xor $push20=, $2, $pop28 ; NO-SIMD128-NEXT: i32.sub $push21=, $pop20, $8 -; NO-SIMD128-NEXT: i32.store16 0($pop23), $pop21 -; NO-SIMD128-NEXT: i32.extend16_s $push24=, $3 -; NO-SIMD128-NEXT: i32.const $push41=, 15 -; NO-SIMD128-NEXT: i32.shr_s $push40=, $pop24, $pop41 -; NO-SIMD128-NEXT: local.tee $push39=, $8=, $pop40 -; NO-SIMD128-NEXT: i32.xor $push25=, $3, $pop39 -; NO-SIMD128-NEXT: i32.sub $push26=, $pop25, $8 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop26 -; 
NO-SIMD128-NEXT: i32.extend16_s $push27=, $2 -; NO-SIMD128-NEXT: i32.const $push38=, 15 -; NO-SIMD128-NEXT: i32.shr_s $push37=, $pop27, $pop38 -; NO-SIMD128-NEXT: local.tee $push36=, $8=, $pop37 -; NO-SIMD128-NEXT: i32.xor $push28=, $2, $pop36 -; NO-SIMD128-NEXT: i32.sub $push29=, $pop28, $8 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop29 -; NO-SIMD128-NEXT: i32.extend16_s $push30=, $1 -; NO-SIMD128-NEXT: i32.const $push35=, 15 -; NO-SIMD128-NEXT: i32.shr_s $push34=, $pop30, $pop35 -; NO-SIMD128-NEXT: local.tee $push33=, $8=, $pop34 -; NO-SIMD128-NEXT: i32.xor $push31=, $1, $pop33 -; NO-SIMD128-NEXT: i32.sub $push32=, $pop31, $8 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop32 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop21 +; NO-SIMD128-NEXT: i32.extend16_s $push22=, $1 +; NO-SIMD128-NEXT: i32.const $push27=, 15 +; NO-SIMD128-NEXT: i32.shr_s $push26=, $pop22, $pop27 +; NO-SIMD128-NEXT: local.tee $push25=, $8=, $pop26 +; NO-SIMD128-NEXT: i32.xor $push23=, $1, $pop25 +; NO-SIMD128-NEXT: i32.sub $push24=, $pop23, $8 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop24 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: abs_v8i16: @@ -7419,68 +6123,60 @@ define <8 x i16> @abs_v8i16(<8 x i16> %x) { ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.extend16_s $push0=, $1 ; NO-SIMD128-FAST-NEXT: i32.const $push1=, 15 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push55=, $pop0, $pop1 -; NO-SIMD128-FAST-NEXT: local.tee $push54=, $9=, $pop55 -; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop54 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push47=, $pop0, $pop1 +; NO-SIMD128-FAST-NEXT: local.tee $push46=, $9=, $pop47 +; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop46 ; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop2, $9 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3 ; NO-SIMD128-FAST-NEXT: i32.extend16_s $push4=, $2 -; NO-SIMD128-FAST-NEXT: i32.const $push53=, 15 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push52=, $pop4, $pop53 -; NO-SIMD128-FAST-NEXT: local.tee $push51=, $1=, $pop52 -; 
NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop51 +; NO-SIMD128-FAST-NEXT: i32.const $push45=, 15 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push44=, $pop4, $pop45 +; NO-SIMD128-FAST-NEXT: local.tee $push43=, $1=, $pop44 +; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop43 ; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop5, $1 ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6 ; NO-SIMD128-FAST-NEXT: i32.extend16_s $push7=, $3 -; NO-SIMD128-FAST-NEXT: i32.const $push50=, 15 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push49=, $pop7, $pop50 -; NO-SIMD128-FAST-NEXT: local.tee $push48=, $2=, $pop49 -; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $3, $pop48 +; NO-SIMD128-FAST-NEXT: i32.const $push42=, 15 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push41=, $pop7, $pop42 +; NO-SIMD128-FAST-NEXT: local.tee $push40=, $2=, $pop41 +; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $3, $pop40 ; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $pop8, $2 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 ; NO-SIMD128-FAST-NEXT: i32.extend16_s $push10=, $4 -; NO-SIMD128-FAST-NEXT: i32.const $push47=, 15 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push46=, $pop10, $pop47 -; NO-SIMD128-FAST-NEXT: local.tee $push45=, $3=, $pop46 -; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $4, $pop45 +; NO-SIMD128-FAST-NEXT: i32.const $push39=, 15 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push38=, $pop10, $pop39 +; NO-SIMD128-FAST-NEXT: local.tee $push37=, $3=, $pop38 +; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $4, $pop37 ; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $pop11, $3 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop12 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push15=, $5 -; NO-SIMD128-FAST-NEXT: i32.const $push44=, 15 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push43=, $pop15, $pop44 -; NO-SIMD128-FAST-NEXT: local.tee $push42=, $4=, $pop43 -; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $5, $pop42 -; NO-SIMD128-FAST-NEXT: i32.sub $push17=, $pop16, $4 -; 
NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push18=, $6 -; NO-SIMD128-FAST-NEXT: i32.const $push41=, 15 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push40=, $pop18, $pop41 -; NO-SIMD128-FAST-NEXT: local.tee $push39=, $5=, $pop40 -; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $6, $pop39 -; NO-SIMD128-FAST-NEXT: i32.sub $push20=, $pop19, $5 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $7 -; NO-SIMD128-FAST-NEXT: i32.const $push38=, 15 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push37=, $pop23, $pop38 -; NO-SIMD128-FAST-NEXT: local.tee $push36=, $6=, $pop37 -; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $7, $pop36 -; NO-SIMD128-FAST-NEXT: i32.sub $push25=, $pop24, $6 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push31=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $8 -; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push34=, $pop28, $pop35 -; NO-SIMD128-FAST-NEXT: local.tee $push33=, $0=, $pop34 -; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $8, $pop33 -; NO-SIMD128-FAST-NEXT: i32.sub $push30=, $pop29, $0 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop32), $pop30 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $5 +; NO-SIMD128-FAST-NEXT: i32.const $push36=, 15 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push35=, $pop13, $pop36 +; NO-SIMD128-FAST-NEXT: local.tee $push34=, $4=, $pop35 +; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $5, $pop34 +; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $pop14, $4 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push16=, $6 +; 
NO-SIMD128-FAST-NEXT: i32.const $push33=, 15 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push32=, $pop16, $pop33 +; NO-SIMD128-FAST-NEXT: local.tee $push31=, $5=, $pop32 +; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $6, $pop31 +; NO-SIMD128-FAST-NEXT: i32.sub $push18=, $pop17, $5 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push19=, $7 +; NO-SIMD128-FAST-NEXT: i32.const $push30=, 15 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push29=, $pop19, $pop30 +; NO-SIMD128-FAST-NEXT: local.tee $push28=, $6=, $pop29 +; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $7, $pop28 +; NO-SIMD128-FAST-NEXT: i32.sub $push21=, $pop20, $6 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push22=, $8 +; NO-SIMD128-FAST-NEXT: i32.const $push27=, 15 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push26=, $pop22, $pop27 +; NO-SIMD128-FAST-NEXT: local.tee $push25=, $7=, $pop26 +; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $8, $pop25 +; NO-SIMD128-FAST-NEXT: i32.sub $push24=, $pop23, $7 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24 ; NO-SIMD128-FAST-NEXT: return %a = sub <8 x i16> zeroinitializer, %x %b = icmp slt <8 x i16> %x, zeroinitializer @@ -7505,37 +6201,29 @@ define <8 x i16> @neg_v8i16(<8 x i16> %x) { ; NO-SIMD128: .functype neg_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 0 -; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $5 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop1 -; NO-SIMD128-NEXT: i32.const $push23=, 0 -; NO-SIMD128-NEXT: i32.sub $push2=, $pop23, $3 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push22=, 0 -; NO-SIMD128-NEXT: i32.sub $push3=, $pop22, $2 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push21=, 0 -; NO-SIMD128-NEXT: i32.sub $push4=, $pop21, $1 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push6=, 14 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 
-; NO-SIMD128-NEXT: i32.const $push20=, 0 -; NO-SIMD128-NEXT: i32.sub $push5=, $pop20, $8 -; NO-SIMD128-NEXT: i32.store16 0($pop7), $pop5 -; NO-SIMD128-NEXT: i32.const $push9=, 12 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-NEXT: i32.const $push19=, 0 -; NO-SIMD128-NEXT: i32.sub $push8=, $pop19, $7 -; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8 -; NO-SIMD128-NEXT: i32.const $push12=, 10 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.const $push18=, 0 -; NO-SIMD128-NEXT: i32.sub $push11=, $pop18, $6 -; NO-SIMD128-NEXT: i32.store16 0($pop13), $pop11 -; NO-SIMD128-NEXT: i32.const $push15=, 6 -; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15 -; NO-SIMD128-NEXT: i32.const $push17=, 0 -; NO-SIMD128-NEXT: i32.sub $push14=, $pop17, $4 -; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14 +; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $8 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop1 +; NO-SIMD128-NEXT: i32.const $push15=, 0 +; NO-SIMD128-NEXT: i32.sub $push2=, $pop15, $7 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop2 +; NO-SIMD128-NEXT: i32.const $push14=, 0 +; NO-SIMD128-NEXT: i32.sub $push3=, $pop14, $6 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop3 +; NO-SIMD128-NEXT: i32.const $push13=, 0 +; NO-SIMD128-NEXT: i32.sub $push4=, $pop13, $5 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop4 +; NO-SIMD128-NEXT: i32.const $push12=, 0 +; NO-SIMD128-NEXT: i32.sub $push5=, $pop12, $4 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop5 +; NO-SIMD128-NEXT: i32.const $push11=, 0 +; NO-SIMD128-NEXT: i32.sub $push6=, $pop11, $3 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push10=, 0 +; NO-SIMD128-NEXT: i32.sub $push7=, $pop10, $2 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop7 +; NO-SIMD128-NEXT: i32.const $push9=, 0 +; NO-SIMD128-NEXT: i32.sub $push8=, $pop9, $1 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop8 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: neg_v8i16: @@ -7544,35 +6232,27 @@ define <8 x i16> @neg_v8i16(<8 x i16> %x) 
{ ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 0 ; NO-SIMD128-FAST-NEXT: i32.sub $push1=, $pop0, $1 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop1 -; NO-SIMD128-FAST-NEXT: i32.const $push23=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop23, $2 +; NO-SIMD128-FAST-NEXT: i32.const $push15=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop15, $2 ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push22=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop22, $3 +; NO-SIMD128-FAST-NEXT: i32.const $push14=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop14, $3 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push4=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop21, $4 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop5), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push20=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $pop20, $5 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop7 -; NO-SIMD128-FAST-NEXT: i32.const $push8=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push19=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push10=, $pop19, $6 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop9), $pop10 -; NO-SIMD128-FAST-NEXT: i32.const $push11=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-FAST-NEXT: i32.const $push18=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push13=, $pop18, $7 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop12), $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push14=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push16=, $pop17, $8 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop16 +; NO-SIMD128-FAST-NEXT: i32.const $push13=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $pop13, $4 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop4 +; NO-SIMD128-FAST-NEXT: 
i32.const $push12=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $pop12, $5 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.const $push11=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop11, $6 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.const $push10=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $pop10, $7 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop7 +; NO-SIMD128-FAST-NEXT: i32.const $push9=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $pop9, $8 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop8 ; NO-SIMD128-FAST-NEXT: return %a = sub <8 x i16> , %x @@ -7596,64 +6276,48 @@ define <8 x i16> @shl_v8i16(<8 x i16> %v, i16 %x) { ; NO-SIMD128: .functype shl_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 65535 -; NO-SIMD128-NEXT: i32.and $push18=, $9, $pop0 -; NO-SIMD128-NEXT: local.tee $push17=, $9=, $pop18 -; NO-SIMD128-NEXT: i32.shl $push1=, $5, $pop17 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop1 -; NO-SIMD128-NEXT: i32.shl $push2=, $3, $9 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop2 -; NO-SIMD128-NEXT: i32.shl $push3=, $2, $9 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop3 -; NO-SIMD128-NEXT: i32.shl $push4=, $1, $9 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push6=, 14 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.shl $push5=, $8, $9 -; NO-SIMD128-NEXT: i32.store16 0($pop7), $pop5 -; NO-SIMD128-NEXT: i32.const $push9=, 12 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-NEXT: i32.shl $push8=, $7, $9 -; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8 -; NO-SIMD128-NEXT: i32.const $push12=, 10 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.shl $push11=, $6, $9 -; NO-SIMD128-NEXT: i32.store16 0($pop13), $pop11 -; NO-SIMD128-NEXT: i32.const $push15=, 6 -; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15 -; NO-SIMD128-NEXT: i32.shl 
$push14=, $4, $9 -; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14 +; NO-SIMD128-NEXT: i32.and $push10=, $9, $pop0 +; NO-SIMD128-NEXT: local.tee $push9=, $9=, $pop10 +; NO-SIMD128-NEXT: i32.shl $push1=, $8, $pop9 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop1 +; NO-SIMD128-NEXT: i32.shl $push2=, $7, $9 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop2 +; NO-SIMD128-NEXT: i32.shl $push3=, $6, $9 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop3 +; NO-SIMD128-NEXT: i32.shl $push4=, $5, $9 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop4 +; NO-SIMD128-NEXT: i32.shl $push5=, $4, $9 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop5 +; NO-SIMD128-NEXT: i32.shl $push6=, $3, $9 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop6 +; NO-SIMD128-NEXT: i32.shl $push7=, $2, $9 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop7 +; NO-SIMD128-NEXT: i32.shl $push8=, $1, $9 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop8 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shl_v8i16: ; NO-SIMD128-FAST: .functype shl_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push18=, $9, $pop0 -; NO-SIMD128-FAST-NEXT: local.tee $push17=, $9=, $pop18 -; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $2, $pop17 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $9, $pop0 +; NO-SIMD128-FAST-NEXT: local.tee $push9=, $9=, $pop10 +; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $2, $pop9 ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $1, $9 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2 ; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $9 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push4=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $9 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop5), $pop6 -; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $5, $9 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop7 -; 
NO-SIMD128-FAST-NEXT: i32.const $push8=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $6, $9 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop9), $pop10 -; NO-SIMD128-FAST-NEXT: i32.const $push11=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $7, $9 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop12), $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push14=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $9 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop16 +; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $9 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $5, $9 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $6, $9 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $7, $9 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop7 +; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $8, $9 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop8 ; NO-SIMD128-FAST-NEXT: return %t = insertelement <8 x i16> undef, i16 %x, i32 0 %s = shufflevector <8 x i16> %t, <8 x i16> undef, @@ -7681,37 +6345,29 @@ define <8 x i16> @shl_const_v8i16(<8 x i16> %v) { ; NO-SIMD128: .functype shl_const_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 5 -; NO-SIMD128-NEXT: i32.shl $push1=, $5, $pop0 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop1 -; NO-SIMD128-NEXT: i32.const $push23=, 5 -; NO-SIMD128-NEXT: i32.shl $push2=, $3, $pop23 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push22=, 5 -; NO-SIMD128-NEXT: i32.shl $push3=, $2, $pop22 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push21=, 5 -; NO-SIMD128-NEXT: i32.shl $push4=, $1, $pop21 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop4 -; NO-SIMD128-NEXT: 
i32.const $push6=, 14 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.const $push20=, 5 -; NO-SIMD128-NEXT: i32.shl $push5=, $8, $pop20 -; NO-SIMD128-NEXT: i32.store16 0($pop7), $pop5 -; NO-SIMD128-NEXT: i32.const $push9=, 12 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-NEXT: i32.const $push19=, 5 -; NO-SIMD128-NEXT: i32.shl $push8=, $7, $pop19 -; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8 -; NO-SIMD128-NEXT: i32.const $push12=, 10 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.const $push18=, 5 -; NO-SIMD128-NEXT: i32.shl $push11=, $6, $pop18 -; NO-SIMD128-NEXT: i32.store16 0($pop13), $pop11 -; NO-SIMD128-NEXT: i32.const $push15=, 6 -; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15 -; NO-SIMD128-NEXT: i32.const $push17=, 5 -; NO-SIMD128-NEXT: i32.shl $push14=, $4, $pop17 -; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14 +; NO-SIMD128-NEXT: i32.shl $push1=, $8, $pop0 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop1 +; NO-SIMD128-NEXT: i32.const $push15=, 5 +; NO-SIMD128-NEXT: i32.shl $push2=, $7, $pop15 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop2 +; NO-SIMD128-NEXT: i32.const $push14=, 5 +; NO-SIMD128-NEXT: i32.shl $push3=, $6, $pop14 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop3 +; NO-SIMD128-NEXT: i32.const $push13=, 5 +; NO-SIMD128-NEXT: i32.shl $push4=, $5, $pop13 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop4 +; NO-SIMD128-NEXT: i32.const $push12=, 5 +; NO-SIMD128-NEXT: i32.shl $push5=, $4, $pop12 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop5 +; NO-SIMD128-NEXT: i32.const $push11=, 5 +; NO-SIMD128-NEXT: i32.shl $push6=, $3, $pop11 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push10=, 5 +; NO-SIMD128-NEXT: i32.shl $push7=, $2, $pop10 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop7 +; NO-SIMD128-NEXT: i32.const $push9=, 5 +; NO-SIMD128-NEXT: i32.shl $push8=, $1, $pop9 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop8 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: 
shl_const_v8i16: @@ -7720,35 +6376,27 @@ define <8 x i16> @shl_const_v8i16(<8 x i16> %v) { ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 5 ; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $1, $pop0 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop1 -; NO-SIMD128-FAST-NEXT: i32.const $push23=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop23 +; NO-SIMD128-FAST-NEXT: i32.const $push15=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop15 ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push22=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop22 +; NO-SIMD128-FAST-NEXT: i32.const $push14=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop14 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push4=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $pop21 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop5), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push20=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $5, $pop20 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop7 -; NO-SIMD128-FAST-NEXT: i32.const $push8=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push19=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $6, $pop19 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop9), $pop10 -; NO-SIMD128-FAST-NEXT: i32.const $push11=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-FAST-NEXT: i32.const $push18=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $7, $pop18 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop12), $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push14=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $pop17 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop16 +; NO-SIMD128-FAST-NEXT: i32.const $push13=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, 
$pop13 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.const $push12=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $5, $pop12 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.const $push11=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $6, $pop11 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.const $push10=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $7, $pop10 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop7 +; NO-SIMD128-FAST-NEXT: i32.const $push9=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $8, $pop9 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop8 ; NO-SIMD128-FAST-NEXT: return %a = shl <8 x i16> %v, @@ -7866,45 +6514,37 @@ define <8 x i16> @shl_vec_v8i16(<8 x i16> %v, <8 x i16> %x) { ; NO-SIMD128: .functype shl_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 65535 -; NO-SIMD128-NEXT: i32.and $push1=, $13, $pop0 -; NO-SIMD128-NEXT: i32.shl $push2=, $5, $pop1 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push31=, 65535 -; NO-SIMD128-NEXT: i32.and $push3=, $11, $pop31 -; NO-SIMD128-NEXT: i32.shl $push4=, $3, $pop3 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push30=, 65535 -; NO-SIMD128-NEXT: i32.and $push5=, $10, $pop30 -; NO-SIMD128-NEXT: i32.shl $push6=, $2, $pop5 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop6 -; NO-SIMD128-NEXT: i32.const $push29=, 65535 -; NO-SIMD128-NEXT: i32.and $push7=, $9, $pop29 -; NO-SIMD128-NEXT: i32.shl $push8=, $1, $pop7 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop8 -; NO-SIMD128-NEXT: i32.const $push11=, 14 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.const $push28=, 65535 -; NO-SIMD128-NEXT: i32.and $push9=, $16, $pop28 -; NO-SIMD128-NEXT: i32.shl $push10=, $8, $pop9 -; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10 -; 
NO-SIMD128-NEXT: i32.const $push15=, 12 -; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15 -; NO-SIMD128-NEXT: i32.const $push27=, 65535 -; NO-SIMD128-NEXT: i32.and $push13=, $15, $pop27 -; NO-SIMD128-NEXT: i32.shl $push14=, $7, $pop13 -; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14 -; NO-SIMD128-NEXT: i32.const $push19=, 10 -; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19 -; NO-SIMD128-NEXT: i32.const $push26=, 65535 -; NO-SIMD128-NEXT: i32.and $push17=, $14, $pop26 -; NO-SIMD128-NEXT: i32.shl $push18=, $6, $pop17 -; NO-SIMD128-NEXT: i32.store16 0($pop20), $pop18 -; NO-SIMD128-NEXT: i32.const $push23=, 6 -; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-NEXT: i32.const $push25=, 65535 -; NO-SIMD128-NEXT: i32.and $push21=, $12, $pop25 -; NO-SIMD128-NEXT: i32.shl $push22=, $4, $pop21 -; NO-SIMD128-NEXT: i32.store16 0($pop24), $pop22 +; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop0 +; NO-SIMD128-NEXT: i32.shl $push2=, $8, $pop1 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop2 +; NO-SIMD128-NEXT: i32.const $push23=, 65535 +; NO-SIMD128-NEXT: i32.and $push3=, $15, $pop23 +; NO-SIMD128-NEXT: i32.shl $push4=, $7, $pop3 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop4 +; NO-SIMD128-NEXT: i32.const $push22=, 65535 +; NO-SIMD128-NEXT: i32.and $push5=, $14, $pop22 +; NO-SIMD128-NEXT: i32.shl $push6=, $6, $pop5 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push21=, 65535 +; NO-SIMD128-NEXT: i32.and $push7=, $13, $pop21 +; NO-SIMD128-NEXT: i32.shl $push8=, $5, $pop7 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop8 +; NO-SIMD128-NEXT: i32.const $push20=, 65535 +; NO-SIMD128-NEXT: i32.and $push9=, $12, $pop20 +; NO-SIMD128-NEXT: i32.shl $push10=, $4, $pop9 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop10 +; NO-SIMD128-NEXT: i32.const $push19=, 65535 +; NO-SIMD128-NEXT: i32.and $push11=, $11, $pop19 +; NO-SIMD128-NEXT: i32.shl $push12=, $3, $pop11 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop12 +; NO-SIMD128-NEXT: i32.const $push18=, 65535 +; 
NO-SIMD128-NEXT: i32.and $push13=, $10, $pop18 +; NO-SIMD128-NEXT: i32.shl $push14=, $2, $pop13 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop14 +; NO-SIMD128-NEXT: i32.const $push17=, 65535 +; NO-SIMD128-NEXT: i32.and $push15=, $9, $pop17 +; NO-SIMD128-NEXT: i32.shl $push16=, $1, $pop15 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop16 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shl_vec_v8i16: @@ -7914,42 +6554,34 @@ define <8 x i16> @shl_vec_v8i16(<8 x i16> %v, <8 x i16> %x) { ; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop0 ; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $1, $pop1 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push31=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push3=, $10, $pop31 +; NO-SIMD128-FAST-NEXT: i32.const $push23=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push3=, $10, $pop23 ; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $2, $pop3 ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push30=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $11, $pop30 +; NO-SIMD128-FAST-NEXT: i32.const $push22=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $11, $pop22 ; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $3, $pop5 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.const $push29=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push9=, $12, $pop29 -; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $4, $pop9 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop10 -; NO-SIMD128-FAST-NEXT: i32.const $push28=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push11=, $13, $pop28 -; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $5, $pop11 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push15=, $14, $pop27 -; 
NO-SIMD128-FAST-NEXT: i32.shl $push16=, $6, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push19=, $15, $pop26 -; NO-SIMD128-FAST-NEXT: i32.shl $push20=, $7, $pop19 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-FAST-NEXT: i32.const $push25=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push23=, $16, $pop25 -; NO-SIMD128-FAST-NEXT: i32.shl $push24=, $8, $pop23 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop24 +; NO-SIMD128-FAST-NEXT: i32.const $push21=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $12, $pop21 +; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $4, $pop7 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.const $push20=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push9=, $13, $pop20 +; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $5, $pop9 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push11=, $14, $pop19 +; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $6, $pop11 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push18=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $15, $pop18 +; NO-SIMD128-FAST-NEXT: i32.shl $push14=, $7, $pop13 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.const $push17=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push15=, $16, $pop17 +; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $pop15 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop16 ; NO-SIMD128-FAST-NEXT: return %a = shl <8 x i16> %v, %x ret <8 x i16> %a @@ -7971,41 +6603,33 @@ define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) { ; NO-SIMD128-LABEL: shr_s_v8i16: ; NO-SIMD128: 
.functype shr_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.extend16_s $push1=, $5 +; NO-SIMD128-NEXT: i32.extend16_s $push1=, $8 ; NO-SIMD128-NEXT: i32.const $push0=, 65535 -; NO-SIMD128-NEXT: i32.and $push26=, $9, $pop0 -; NO-SIMD128-NEXT: local.tee $push25=, $9=, $pop26 -; NO-SIMD128-NEXT: i32.shr_s $push2=, $pop1, $pop25 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop2 -; NO-SIMD128-NEXT: i32.extend16_s $push3=, $3 +; NO-SIMD128-NEXT: i32.and $push18=, $9, $pop0 +; NO-SIMD128-NEXT: local.tee $push17=, $9=, $pop18 +; NO-SIMD128-NEXT: i32.shr_s $push2=, $pop1, $pop17 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop2 +; NO-SIMD128-NEXT: i32.extend16_s $push3=, $7 ; NO-SIMD128-NEXT: i32.shr_s $push4=, $pop3, $9 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop4 -; NO-SIMD128-NEXT: i32.extend16_s $push5=, $2 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop4 +; NO-SIMD128-NEXT: i32.extend16_s $push5=, $6 ; NO-SIMD128-NEXT: i32.shr_s $push6=, $pop5, $9 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop6 -; NO-SIMD128-NEXT: i32.extend16_s $push7=, $1 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop6 +; NO-SIMD128-NEXT: i32.extend16_s $push7=, $5 ; NO-SIMD128-NEXT: i32.shr_s $push8=, $pop7, $9 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop8 -; NO-SIMD128-NEXT: i32.const $push11=, 14 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.extend16_s $push9=, $8 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop8 +; NO-SIMD128-NEXT: i32.extend16_s $push9=, $4 ; NO-SIMD128-NEXT: i32.shr_s $push10=, $pop9, $9 -; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push15=, 12 -; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15 -; NO-SIMD128-NEXT: i32.extend16_s $push13=, $7 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop10 +; NO-SIMD128-NEXT: i32.extend16_s $push11=, $3 +; NO-SIMD128-NEXT: i32.shr_s $push12=, $pop11, $9 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop12 +; NO-SIMD128-NEXT: i32.extend16_s 
$push13=, $2 ; NO-SIMD128-NEXT: i32.shr_s $push14=, $pop13, $9 -; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14 -; NO-SIMD128-NEXT: i32.const $push19=, 10 -; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19 -; NO-SIMD128-NEXT: i32.extend16_s $push17=, $6 -; NO-SIMD128-NEXT: i32.shr_s $push18=, $pop17, $9 -; NO-SIMD128-NEXT: i32.store16 0($pop20), $pop18 -; NO-SIMD128-NEXT: i32.const $push23=, 6 -; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-NEXT: i32.extend16_s $push21=, $4 -; NO-SIMD128-NEXT: i32.shr_s $push22=, $pop21, $9 -; NO-SIMD128-NEXT: i32.store16 0($pop24), $pop22 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop14 +; NO-SIMD128-NEXT: i32.extend16_s $push15=, $1 +; NO-SIMD128-NEXT: i32.shr_s $push16=, $pop15, $9 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop16 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shr_s_v8i16: @@ -8013,9 +6637,9 @@ define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) { ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.extend16_s $push1=, $1 ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push26=, $9, $pop0 -; NO-SIMD128-FAST-NEXT: local.tee $push25=, $1=, $pop26 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $pop1, $pop25 +; NO-SIMD128-FAST-NEXT: i32.and $push18=, $9, $pop0 +; NO-SIMD128-FAST-NEXT: local.tee $push17=, $1=, $pop18 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $pop1, $pop17 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2 ; NO-SIMD128-FAST-NEXT: i32.extend16_s $push3=, $2 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push4=, $pop3, $1 @@ -8023,29 +6647,21 @@ define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) { ; NO-SIMD128-FAST-NEXT: i32.extend16_s $push5=, $3 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push6=, $pop5, $1 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push9=, $4 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push7=, $4 +; 
NO-SIMD128-FAST-NEXT: i32.shr_s $push8=, $pop7, $1 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push9=, $5 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push10=, $pop9, $1 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop10 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $5 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $6 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $pop11, $1 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push15=, $6 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $7 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push14=, $pop13, $1 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push15=, $8 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push16=, $pop15, $1 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push19=, $7 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push20=, $pop19, $1 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $8 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push24=, $pop23, $1 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop24 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop16 ; NO-SIMD128-FAST-NEXT: return %t = insertelement <8 x i16> undef, i16 %x, i32 0 %s = shufflevector <8 x i16> %t, <8 x i16> undef, @@ -8164,54 +6780,46 @@ define <8 x i16> @shr_s_vec_v8i16(<8 x i16> %v, <8 x i16> %x) { ; NO-SIMD128-LABEL: shr_s_vec_v8i16: ; NO-SIMD128: .functype shr_s_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.extend16_s $push2=, $5 +; NO-SIMD128-NEXT: i32.extend16_s $push2=, $8 ; NO-SIMD128-NEXT: i32.const $push0=, 65535 -; NO-SIMD128-NEXT: i32.and $push1=, $13, $pop0 +; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop0 ; NO-SIMD128-NEXT: i32.shr_s $push3=, $pop2, $pop1 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop3 -; NO-SIMD128-NEXT: i32.extend16_s $push5=, $3 -; NO-SIMD128-NEXT: i32.const $push39=, 65535 -; NO-SIMD128-NEXT: i32.and $push4=, $11, $pop39 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop3 +; NO-SIMD128-NEXT: i32.extend16_s $push5=, $7 +; NO-SIMD128-NEXT: i32.const $push31=, 65535 +; NO-SIMD128-NEXT: i32.and $push4=, $15, $pop31 ; NO-SIMD128-NEXT: i32.shr_s $push6=, $pop5, $pop4 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop6 -; NO-SIMD128-NEXT: i32.extend16_s $push8=, $2 -; NO-SIMD128-NEXT: i32.const $push38=, 65535 -; NO-SIMD128-NEXT: i32.and $push7=, $10, $pop38 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop6 +; NO-SIMD128-NEXT: i32.extend16_s $push8=, $6 +; NO-SIMD128-NEXT: i32.const $push30=, 65535 +; NO-SIMD128-NEXT: i32.and $push7=, $14, $pop30 ; NO-SIMD128-NEXT: i32.shr_s $push9=, $pop8, $pop7 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop9 -; NO-SIMD128-NEXT: i32.extend16_s $push11=, $1 -; NO-SIMD128-NEXT: i32.const $push37=, 65535 -; NO-SIMD128-NEXT: i32.and $push10=, $9, $pop37 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop9 +; NO-SIMD128-NEXT: i32.extend16_s $push11=, $5 +; NO-SIMD128-NEXT: i32.const $push29=, 65535 +; NO-SIMD128-NEXT: i32.and $push10=, $13, $pop29 ; NO-SIMD128-NEXT: i32.shr_s $push12=, $pop11, $pop10 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop12 -; NO-SIMD128-NEXT: i32.const $push16=, 14 -; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.extend16_s $push14=, $8 -; NO-SIMD128-NEXT: i32.const $push36=, 65535 -; NO-SIMD128-NEXT: i32.and $push13=, $16, $pop36 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop12 +; NO-SIMD128-NEXT: i32.extend16_s 
$push14=, $4 +; NO-SIMD128-NEXT: i32.const $push28=, 65535 +; NO-SIMD128-NEXT: i32.and $push13=, $12, $pop28 ; NO-SIMD128-NEXT: i32.shr_s $push15=, $pop14, $pop13 -; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15 -; NO-SIMD128-NEXT: i32.const $push21=, 12 -; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-NEXT: i32.extend16_s $push19=, $7 -; NO-SIMD128-NEXT: i32.const $push35=, 65535 -; NO-SIMD128-NEXT: i32.and $push18=, $15, $pop35 -; NO-SIMD128-NEXT: i32.shr_s $push20=, $pop19, $pop18 -; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20 -; NO-SIMD128-NEXT: i32.const $push26=, 10 -; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26 -; NO-SIMD128-NEXT: i32.extend16_s $push24=, $6 -; NO-SIMD128-NEXT: i32.const $push34=, 65535 -; NO-SIMD128-NEXT: i32.and $push23=, $14, $pop34 -; NO-SIMD128-NEXT: i32.shr_s $push25=, $pop24, $pop23 -; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25 -; NO-SIMD128-NEXT: i32.const $push31=, 6 -; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31 -; NO-SIMD128-NEXT: i32.extend16_s $push29=, $4 -; NO-SIMD128-NEXT: i32.const $push33=, 65535 -; NO-SIMD128-NEXT: i32.and $push28=, $12, $pop33 -; NO-SIMD128-NEXT: i32.shr_s $push30=, $pop29, $pop28 -; NO-SIMD128-NEXT: i32.store16 0($pop32), $pop30 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop15 +; NO-SIMD128-NEXT: i32.extend16_s $push17=, $3 +; NO-SIMD128-NEXT: i32.const $push27=, 65535 +; NO-SIMD128-NEXT: i32.and $push16=, $11, $pop27 +; NO-SIMD128-NEXT: i32.shr_s $push18=, $pop17, $pop16 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop18 +; NO-SIMD128-NEXT: i32.extend16_s $push20=, $2 +; NO-SIMD128-NEXT: i32.const $push26=, 65535 +; NO-SIMD128-NEXT: i32.and $push19=, $10, $pop26 +; NO-SIMD128-NEXT: i32.shr_s $push21=, $pop20, $pop19 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop21 +; NO-SIMD128-NEXT: i32.extend16_s $push23=, $1 +; NO-SIMD128-NEXT: i32.const $push25=, 65535 +; NO-SIMD128-NEXT: i32.and $push22=, $9, $pop25 +; NO-SIMD128-NEXT: i32.shr_s $push24=, $pop23, $pop22 +; NO-SIMD128-NEXT: 
i32.store16 0($0), $pop24 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shr_s_vec_v8i16: @@ -8223,48 +6831,40 @@ define <8 x i16> @shr_s_vec_v8i16(<8 x i16> %v, <8 x i16> %x) { ; NO-SIMD128-FAST-NEXT: i32.shr_s $push3=, $pop2, $pop1 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3 ; NO-SIMD128-FAST-NEXT: i32.extend16_s $push5=, $2 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop39 +; NO-SIMD128-FAST-NEXT: i32.const $push31=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop31 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push6=, $pop5, $pop4 ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6 ; NO-SIMD128-FAST-NEXT: i32.extend16_s $push8=, $3 -; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop38 +; NO-SIMD128-FAST-NEXT: i32.const $push30=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop30 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push9=, $pop8, $pop7 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $4 -; NO-SIMD128-FAST-NEXT: i32.const $push37=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push12=, $12, $pop37 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push14=, $pop13, $pop12 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop14 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push16=, $5 -; NO-SIMD128-FAST-NEXT: i32.const $push36=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push15=, $13, $pop36 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push17=, $pop16, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push18=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push21=, $6 -; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push20=, $14, $pop35 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push22=, $pop21, $pop20 -; 
NO-SIMD128-FAST-NEXT: i32.store16 0($pop19), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push23=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push26=, $7 -; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $pop34 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push27=, $pop26, $pop25 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop24), $pop27 -; NO-SIMD128-FAST-NEXT: i32.const $push28=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push29=, $0, $pop28 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push31=, $8 -; NO-SIMD128-FAST-NEXT: i32.const $push33=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push30=, $16, $pop33 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push32=, $pop31, $pop30 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop29), $pop32 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $4 +; NO-SIMD128-FAST-NEXT: i32.const $push29=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop29 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $pop11, $pop10 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push14=, $5 +; NO-SIMD128-FAST-NEXT: i32.const $push28=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $13, $pop28 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push15=, $pop14, $pop13 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push17=, $6 +; NO-SIMD128-FAST-NEXT: i32.const $push27=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push16=, $14, $pop27 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push18=, $pop17, $pop16 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push20=, $7 +; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $15, $pop26 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push21=, $pop20, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $8 +; NO-SIMD128-FAST-NEXT: i32.const $push25=, 65535 +; 
NO-SIMD128-FAST-NEXT: i32.and $push22=, $16, $pop25 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push24=, $pop23, $pop22 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24 ; NO-SIMD128-FAST-NEXT: return %a = ashr <8 x i16> %v, %x ret <8 x i16> %a @@ -8287,48 +6887,40 @@ define <8 x i16> @shr_u_v8i16(<8 x i16> %v, i16 %x) { ; NO-SIMD128: .functype shr_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 65535 -; NO-SIMD128-NEXT: i32.and $push1=, $5, $pop0 -; NO-SIMD128-NEXT: i32.const $push34=, 65535 -; NO-SIMD128-NEXT: i32.and $push33=, $9, $pop34 -; NO-SIMD128-NEXT: local.tee $push32=, $9=, $pop33 -; NO-SIMD128-NEXT: i32.shr_u $push2=, $pop1, $pop32 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push31=, 65535 -; NO-SIMD128-NEXT: i32.and $push3=, $3, $pop31 +; NO-SIMD128-NEXT: i32.and $push1=, $8, $pop0 +; NO-SIMD128-NEXT: i32.const $push26=, 65535 +; NO-SIMD128-NEXT: i32.and $push25=, $9, $pop26 +; NO-SIMD128-NEXT: local.tee $push24=, $9=, $pop25 +; NO-SIMD128-NEXT: i32.shr_u $push2=, $pop1, $pop24 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop2 +; NO-SIMD128-NEXT: i32.const $push23=, 65535 +; NO-SIMD128-NEXT: i32.and $push3=, $7, $pop23 ; NO-SIMD128-NEXT: i32.shr_u $push4=, $pop3, $9 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push30=, 65535 -; NO-SIMD128-NEXT: i32.and $push5=, $2, $pop30 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop4 +; NO-SIMD128-NEXT: i32.const $push22=, 65535 +; NO-SIMD128-NEXT: i32.and $push5=, $6, $pop22 ; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $9 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop6 -; NO-SIMD128-NEXT: i32.const $push29=, 65535 -; NO-SIMD128-NEXT: i32.and $push7=, $1, $pop29 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push21=, 65535 +; NO-SIMD128-NEXT: i32.and $push7=, $5, $pop21 ; NO-SIMD128-NEXT: i32.shr_u $push8=, $pop7, $9 -; NO-SIMD128-NEXT: i32.store16 0($0), 
$pop8 -; NO-SIMD128-NEXT: i32.const $push11=, 14 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.const $push28=, 65535 -; NO-SIMD128-NEXT: i32.and $push9=, $8, $pop28 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop8 +; NO-SIMD128-NEXT: i32.const $push20=, 65535 +; NO-SIMD128-NEXT: i32.and $push9=, $4, $pop20 ; NO-SIMD128-NEXT: i32.shr_u $push10=, $pop9, $9 -; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push15=, 12 -; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15 -; NO-SIMD128-NEXT: i32.const $push27=, 65535 -; NO-SIMD128-NEXT: i32.and $push13=, $7, $pop27 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop10 +; NO-SIMD128-NEXT: i32.const $push19=, 65535 +; NO-SIMD128-NEXT: i32.and $push11=, $3, $pop19 +; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $9 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop12 +; NO-SIMD128-NEXT: i32.const $push18=, 65535 +; NO-SIMD128-NEXT: i32.and $push13=, $2, $pop18 ; NO-SIMD128-NEXT: i32.shr_u $push14=, $pop13, $9 -; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14 -; NO-SIMD128-NEXT: i32.const $push19=, 10 -; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19 -; NO-SIMD128-NEXT: i32.const $push26=, 65535 -; NO-SIMD128-NEXT: i32.and $push17=, $6, $pop26 -; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $9 -; NO-SIMD128-NEXT: i32.store16 0($pop20), $pop18 -; NO-SIMD128-NEXT: i32.const $push23=, 6 -; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-NEXT: i32.const $push25=, 65535 -; NO-SIMD128-NEXT: i32.and $push21=, $4, $pop25 -; NO-SIMD128-NEXT: i32.shr_u $push22=, $pop21, $9 -; NO-SIMD128-NEXT: i32.store16 0($pop24), $pop22 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop14 +; NO-SIMD128-NEXT: i32.const $push17=, 65535 +; NO-SIMD128-NEXT: i32.and $push15=, $1, $pop17 +; NO-SIMD128-NEXT: i32.shr_u $push16=, $pop15, $9 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop16 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shr_u_v8i16: @@ -8336,47 +6928,39 @@ define <8 x i16> @shr_u_v8i16(<8 x i16> %v, 
i16 %x) { ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535 ; NO-SIMD128-FAST-NEXT: i32.and $push1=, $1, $pop0 -; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push33=, $9, $pop34 -; NO-SIMD128-FAST-NEXT: local.tee $push32=, $1=, $pop33 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $pop1, $pop32 +; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push25=, $9, $pop26 +; NO-SIMD128-FAST-NEXT: local.tee $push24=, $1=, $pop25 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $pop1, $pop24 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push31=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push3=, $2, $pop31 +; NO-SIMD128-FAST-NEXT: i32.const $push23=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push3=, $2, $pop23 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push4=, $pop3, $1 ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push30=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $3, $pop30 +; NO-SIMD128-FAST-NEXT: i32.const $push22=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $3, $pop22 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $1 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push29=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push7=, $4, $pop29 +; NO-SIMD128-FAST-NEXT: i32.const $push21=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $4, $pop21 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push8=, $pop7, $1 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop10), $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push28=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push11=, $5, $pop28 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.const $push20=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push9=, $5, $pop20 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push10=, $pop9, $1 +; 
NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push11=, $6, $pop19 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $1 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push15=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push16=, $0, $pop15 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push13=, $6, $pop27 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push18=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $7, $pop18 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push14=, $pop13, $1 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop16), $pop14 -; NO-SIMD128-FAST-NEXT: i32.const $push19=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push20=, $0, $pop19 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push17=, $7, $pop26 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push18=, $pop17, $1 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop20), $pop18 -; NO-SIMD128-FAST-NEXT: i32.const $push23=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-FAST-NEXT: i32.const $push25=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push21=, $8, $pop25 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push22=, $pop21, $1 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop24), $pop22 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.const $push17=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push15=, $8, $pop17 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push16=, $pop15, $1 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop16 ; NO-SIMD128-FAST-NEXT: return %t = insertelement <8 x i16> undef, i16 %x, i32 0 %s = shufflevector <8 x i16> %t, <8 x i16> undef, @@ -8496,61 +7080,53 @@ define <8 x i16> @shr_u_vec_v8i16(<8 x i16> %v, <8 x i16> %x) { ; NO-SIMD128: .functype shr_u_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; 
NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 65535 -; NO-SIMD128-NEXT: i32.and $push2=, $5, $pop0 -; NO-SIMD128-NEXT: i32.const $push47=, 65535 -; NO-SIMD128-NEXT: i32.and $push1=, $13, $pop47 -; NO-SIMD128-NEXT: i32.shr_u $push3=, $pop2, $pop1 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push46=, 65535 -; NO-SIMD128-NEXT: i32.and $push5=, $3, $pop46 -; NO-SIMD128-NEXT: i32.const $push45=, 65535 -; NO-SIMD128-NEXT: i32.and $push4=, $11, $pop45 -; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop4 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop6 -; NO-SIMD128-NEXT: i32.const $push44=, 65535 -; NO-SIMD128-NEXT: i32.and $push8=, $2, $pop44 -; NO-SIMD128-NEXT: i32.const $push43=, 65535 -; NO-SIMD128-NEXT: i32.and $push7=, $10, $pop43 -; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop7 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop9 -; NO-SIMD128-NEXT: i32.const $push42=, 65535 -; NO-SIMD128-NEXT: i32.and $push11=, $1, $pop42 -; NO-SIMD128-NEXT: i32.const $push41=, 65535 -; NO-SIMD128-NEXT: i32.and $push10=, $9, $pop41 -; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $pop10 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop12 -; NO-SIMD128-NEXT: i32.const $push16=, 14 -; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.const $push40=, 65535 -; NO-SIMD128-NEXT: i32.and $push14=, $8, $pop40 +; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0 ; NO-SIMD128-NEXT: i32.const $push39=, 65535 -; NO-SIMD128-NEXT: i32.and $push13=, $16, $pop39 -; NO-SIMD128-NEXT: i32.shr_u $push15=, $pop14, $pop13 -; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15 -; NO-SIMD128-NEXT: i32.const $push21=, 12 -; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21 +; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop39 +; NO-SIMD128-NEXT: i32.shr_u $push3=, $pop2, $pop1 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop3 ; NO-SIMD128-NEXT: i32.const $push38=, 65535 -; NO-SIMD128-NEXT: i32.and $push19=, $7, $pop38 +; NO-SIMD128-NEXT: i32.and $push5=, $7, $pop38 ; 
NO-SIMD128-NEXT: i32.const $push37=, 65535 -; NO-SIMD128-NEXT: i32.and $push18=, $15, $pop37 -; NO-SIMD128-NEXT: i32.shr_u $push20=, $pop19, $pop18 -; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20 -; NO-SIMD128-NEXT: i32.const $push26=, 10 -; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26 +; NO-SIMD128-NEXT: i32.and $push4=, $15, $pop37 +; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop4 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop6 ; NO-SIMD128-NEXT: i32.const $push36=, 65535 -; NO-SIMD128-NEXT: i32.and $push24=, $6, $pop36 +; NO-SIMD128-NEXT: i32.and $push8=, $6, $pop36 ; NO-SIMD128-NEXT: i32.const $push35=, 65535 -; NO-SIMD128-NEXT: i32.and $push23=, $14, $pop35 -; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop23 -; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25 -; NO-SIMD128-NEXT: i32.const $push31=, 6 -; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31 +; NO-SIMD128-NEXT: i32.and $push7=, $14, $pop35 +; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop7 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop9 ; NO-SIMD128-NEXT: i32.const $push34=, 65535 -; NO-SIMD128-NEXT: i32.and $push29=, $4, $pop34 +; NO-SIMD128-NEXT: i32.and $push11=, $5, $pop34 ; NO-SIMD128-NEXT: i32.const $push33=, 65535 -; NO-SIMD128-NEXT: i32.and $push28=, $12, $pop33 -; NO-SIMD128-NEXT: i32.shr_u $push30=, $pop29, $pop28 -; NO-SIMD128-NEXT: i32.store16 0($pop32), $pop30 +; NO-SIMD128-NEXT: i32.and $push10=, $13, $pop33 +; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $pop10 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop12 +; NO-SIMD128-NEXT: i32.const $push32=, 65535 +; NO-SIMD128-NEXT: i32.and $push14=, $4, $pop32 +; NO-SIMD128-NEXT: i32.const $push31=, 65535 +; NO-SIMD128-NEXT: i32.and $push13=, $12, $pop31 +; NO-SIMD128-NEXT: i32.shr_u $push15=, $pop14, $pop13 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop15 +; NO-SIMD128-NEXT: i32.const $push30=, 65535 +; NO-SIMD128-NEXT: i32.and $push17=, $3, $pop30 +; NO-SIMD128-NEXT: i32.const $push29=, 65535 +; NO-SIMD128-NEXT: i32.and $push16=, $11, $pop29 +; 
NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $pop16 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop18 +; NO-SIMD128-NEXT: i32.const $push28=, 65535 +; NO-SIMD128-NEXT: i32.and $push20=, $2, $pop28 +; NO-SIMD128-NEXT: i32.const $push27=, 65535 +; NO-SIMD128-NEXT: i32.and $push19=, $10, $pop27 +; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop19 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop21 +; NO-SIMD128-NEXT: i32.const $push26=, 65535 +; NO-SIMD128-NEXT: i32.and $push23=, $1, $pop26 +; NO-SIMD128-NEXT: i32.const $push25=, 65535 +; NO-SIMD128-NEXT: i32.and $push22=, $9, $pop25 +; NO-SIMD128-NEXT: i32.shr_u $push24=, $pop23, $pop22 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop24 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shr_u_vec_v8i16: @@ -8558,60 +7134,52 @@ define <8 x i16> @shr_u_vec_v8i16(<8 x i16> %v, <8 x i16> %x) { ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535 ; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0 -; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop47 +; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop39 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push3=, $pop2, $pop1 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop46 -; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop45 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop4 -; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop44 -; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop43 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop7 -; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 6 -; NO-SIMD128-FAST-NEXT: i32.add 
$push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop42 -; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop41 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $pop10 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push40=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push16=, $5, $pop40 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push15=, $13, $pop39 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21 ; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push19=, $6, $pop38 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop38 ; NO-SIMD128-FAST-NEXT: i32.const $push37=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push18=, $14, $pop37 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push20=, $pop19, $pop18 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 +; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop37 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop4 +; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6 ; NO-SIMD128-FAST-NEXT: i32.const $push36=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push24=, $7, $pop36 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop36 ; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push23=, $15, $pop35 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop23 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push31=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop35 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, 
$pop7 +; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9 ; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $pop34 +; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop34 ; NO-SIMD128-FAST-NEXT: i32.const $push33=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push28=, $16, $pop33 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push30=, $pop29, $pop28 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop32), $pop30 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop33 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $pop10 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push32=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push14=, $5, $pop32 +; NO-SIMD128-FAST-NEXT: i32.const $push31=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $13, $pop31 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push15=, $pop14, $pop13 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15 +; NO-SIMD128-FAST-NEXT: i32.const $push30=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $6, $pop30 +; NO-SIMD128-FAST-NEXT: i32.const $push29=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push16=, $14, $pop29 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push18=, $pop17, $pop16 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18 +; NO-SIMD128-FAST-NEXT: i32.const $push28=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop28 +; NO-SIMD128-FAST-NEXT: i32.const $push27=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $15, $pop27 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21 +; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push23=, $8, $pop26 +; NO-SIMD128-FAST-NEXT: i32.const $push25=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $16, $pop25 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push24=, $pop23, $pop22 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24 ; NO-SIMD128-FAST-NEXT: return %a = lshr <8 x i16> %v, %x ret <8 x i16> %a @@ -8633,30 +7201,22 @@ define <8 
x i16> @and_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-LABEL: and_v8i16: ; NO-SIMD128: .functype and_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.and $push0=, $5, $13 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop0 -; NO-SIMD128-NEXT: i32.and $push1=, $3, $11 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop1 -; NO-SIMD128-NEXT: i32.and $push2=, $2, $10 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop2 -; NO-SIMD128-NEXT: i32.and $push3=, $1, $9 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push5=, 14 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 -; NO-SIMD128-NEXT: i32.and $push4=, $8, $16 -; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4 -; NO-SIMD128-NEXT: i32.const $push8=, 12 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.and $push7=, $7, $15 -; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7 -; NO-SIMD128-NEXT: i32.const $push11=, 10 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.and $push10=, $6, $14 -; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push14=, 6 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.and $push13=, $4, $12 -; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13 +; NO-SIMD128-NEXT: i32.and $push0=, $8, $16 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop0 +; NO-SIMD128-NEXT: i32.and $push1=, $7, $15 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop1 +; NO-SIMD128-NEXT: i32.and $push2=, $6, $14 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop2 +; NO-SIMD128-NEXT: i32.and $push3=, $5, $13 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop3 +; NO-SIMD128-NEXT: i32.and $push4=, $4, $12 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop4 +; NO-SIMD128-NEXT: i32.and $push5=, $3, $11 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop5 +; NO-SIMD128-NEXT: i32.and $push6=, $2, $10 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop6 +; NO-SIMD128-NEXT: i32.and 
$push7=, $1, $9 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop7 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: and_v8i16: @@ -8668,24 +7228,16 @@ define <8 x i16> @and_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.and $push2=, $3, $11 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $4, $12 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5 -; NO-SIMD128-FAST-NEXT: i32.and $push6=, $5, $13 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.and $push9=, $6, $14 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-FAST-NEXT: i32.and $push12=, $7, $15 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.and $push15=, $8, $16 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15 +; NO-SIMD128-FAST-NEXT: i32.and $push3=, $4, $12 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3 +; NO-SIMD128-FAST-NEXT: i32.and $push4=, $5, $13 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $6, $14 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.and $push6=, $7, $15 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $8, $16 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7 ; NO-SIMD128-FAST-NEXT: return %a = and <8 x i16> %x, %y ret <8 x i16> %a @@ -8707,30 +7259,22 @@ define <8 x i16> @or_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-LABEL: or_v8i16: ; NO-SIMD128: .functype or_v8i16 (i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.or $push0=, $5, $13 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop0 -; NO-SIMD128-NEXT: i32.or $push1=, $3, $11 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop1 -; NO-SIMD128-NEXT: i32.or $push2=, $2, $10 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop2 -; NO-SIMD128-NEXT: i32.or $push3=, $1, $9 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push5=, 14 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 -; NO-SIMD128-NEXT: i32.or $push4=, $8, $16 -; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4 -; NO-SIMD128-NEXT: i32.const $push8=, 12 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.or $push7=, $7, $15 -; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7 -; NO-SIMD128-NEXT: i32.const $push11=, 10 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.or $push10=, $6, $14 -; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push14=, 6 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.or $push13=, $4, $12 -; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13 +; NO-SIMD128-NEXT: i32.or $push0=, $8, $16 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop0 +; NO-SIMD128-NEXT: i32.or $push1=, $7, $15 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop1 +; NO-SIMD128-NEXT: i32.or $push2=, $6, $14 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop2 +; NO-SIMD128-NEXT: i32.or $push3=, $5, $13 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop3 +; NO-SIMD128-NEXT: i32.or $push4=, $4, $12 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop4 +; NO-SIMD128-NEXT: i32.or $push5=, $3, $11 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop5 +; NO-SIMD128-NEXT: i32.or $push6=, $2, $10 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop6 +; NO-SIMD128-NEXT: i32.or $push7=, $1, $9 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop7 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: or_v8i16: @@ -8742,24 
+7286,16 @@ define <8 x i16> @or_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.or $push2=, $3, $11 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.or $push5=, $4, $12 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5 -; NO-SIMD128-FAST-NEXT: i32.or $push6=, $5, $13 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.or $push9=, $6, $14 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-FAST-NEXT: i32.or $push12=, $7, $15 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.or $push15=, $8, $16 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15 +; NO-SIMD128-FAST-NEXT: i32.or $push3=, $4, $12 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3 +; NO-SIMD128-FAST-NEXT: i32.or $push4=, $5, $13 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.or $push5=, $6, $14 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.or $push6=, $7, $15 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.or $push7=, $8, $16 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7 ; NO-SIMD128-FAST-NEXT: return %a = or <8 x i16> %x, %y ret <8 x i16> %a @@ -8781,30 +7317,22 @@ define <8 x i16> @xor_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-LABEL: xor_v8i16: ; NO-SIMD128: .functype xor_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.xor 
$push0=, $5, $13 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop0 -; NO-SIMD128-NEXT: i32.xor $push1=, $3, $11 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop1 -; NO-SIMD128-NEXT: i32.xor $push2=, $2, $10 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop2 -; NO-SIMD128-NEXT: i32.xor $push3=, $1, $9 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push5=, 14 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 -; NO-SIMD128-NEXT: i32.xor $push4=, $8, $16 -; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4 -; NO-SIMD128-NEXT: i32.const $push8=, 12 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.xor $push7=, $7, $15 -; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7 -; NO-SIMD128-NEXT: i32.const $push11=, 10 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.xor $push10=, $6, $14 -; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push14=, 6 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.xor $push13=, $4, $12 -; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13 +; NO-SIMD128-NEXT: i32.xor $push0=, $8, $16 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop0 +; NO-SIMD128-NEXT: i32.xor $push1=, $7, $15 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop1 +; NO-SIMD128-NEXT: i32.xor $push2=, $6, $14 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop2 +; NO-SIMD128-NEXT: i32.xor $push3=, $5, $13 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop3 +; NO-SIMD128-NEXT: i32.xor $push4=, $4, $12 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop4 +; NO-SIMD128-NEXT: i32.xor $push5=, $3, $11 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop5 +; NO-SIMD128-NEXT: i32.xor $push6=, $2, $10 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop6 +; NO-SIMD128-NEXT: i32.xor $push7=, $1, $9 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop7 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: xor_v8i16: @@ -8816,24 +7344,16 @@ define <8 x i16> @xor_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1 ; 
NO-SIMD128-FAST-NEXT: i32.xor $push2=, $3, $11 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $4, $12 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5 -; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $5, $13 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $6, $14 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $7, $15 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $8, $16 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15 +; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $4, $12 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3 +; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $5, $13 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $6, $14 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $7, $15 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $8, $16 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7 ; NO-SIMD128-FAST-NEXT: return %a = xor <8 x i16> %x, %y ret <8 x i16> %a @@ -8856,37 +7376,29 @@ define <8 x i16> @not_v8i16(<8 x i16> %x) { ; NO-SIMD128: .functype not_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, -1 -; NO-SIMD128-NEXT: i32.xor $push1=, $5, $pop0 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop1 -; NO-SIMD128-NEXT: i32.const $push23=, -1 -; NO-SIMD128-NEXT: i32.xor $push2=, $3, $pop23 
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push22=, -1 -; NO-SIMD128-NEXT: i32.xor $push3=, $2, $pop22 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push21=, -1 -; NO-SIMD128-NEXT: i32.xor $push4=, $1, $pop21 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push6=, 14 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.const $push20=, -1 -; NO-SIMD128-NEXT: i32.xor $push5=, $8, $pop20 -; NO-SIMD128-NEXT: i32.store16 0($pop7), $pop5 -; NO-SIMD128-NEXT: i32.const $push9=, 12 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-NEXT: i32.const $push19=, -1 -; NO-SIMD128-NEXT: i32.xor $push8=, $7, $pop19 -; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8 -; NO-SIMD128-NEXT: i32.const $push12=, 10 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.const $push18=, -1 -; NO-SIMD128-NEXT: i32.xor $push11=, $6, $pop18 -; NO-SIMD128-NEXT: i32.store16 0($pop13), $pop11 -; NO-SIMD128-NEXT: i32.const $push15=, 6 -; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15 -; NO-SIMD128-NEXT: i32.const $push17=, -1 -; NO-SIMD128-NEXT: i32.xor $push14=, $4, $pop17 -; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14 +; NO-SIMD128-NEXT: i32.xor $push1=, $8, $pop0 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop1 +; NO-SIMD128-NEXT: i32.const $push15=, -1 +; NO-SIMD128-NEXT: i32.xor $push2=, $7, $pop15 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop2 +; NO-SIMD128-NEXT: i32.const $push14=, -1 +; NO-SIMD128-NEXT: i32.xor $push3=, $6, $pop14 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop3 +; NO-SIMD128-NEXT: i32.const $push13=, -1 +; NO-SIMD128-NEXT: i32.xor $push4=, $5, $pop13 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop4 +; NO-SIMD128-NEXT: i32.const $push12=, -1 +; NO-SIMD128-NEXT: i32.xor $push5=, $4, $pop12 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop5 +; NO-SIMD128-NEXT: i32.const $push11=, -1 +; NO-SIMD128-NEXT: i32.xor $push6=, $3, $pop11 +; NO-SIMD128-NEXT: 
i32.store16 4($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push10=, -1 +; NO-SIMD128-NEXT: i32.xor $push7=, $2, $pop10 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop7 +; NO-SIMD128-NEXT: i32.const $push9=, -1 +; NO-SIMD128-NEXT: i32.xor $push8=, $1, $pop9 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop8 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: not_v8i16: @@ -8895,35 +7407,27 @@ define <8 x i16> @not_v8i16(<8 x i16> %x) { ; NO-SIMD128-FAST-NEXT: i32.const $push0=, -1 ; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop0 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop1 -; NO-SIMD128-FAST-NEXT: i32.const $push23=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop23 +; NO-SIMD128-FAST-NEXT: i32.const $push15=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop15 ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push22=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop22 +; NO-SIMD128-FAST-NEXT: i32.const $push14=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop14 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push4=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $4, $pop21 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop5), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $5, $pop20 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop7 -; NO-SIMD128-FAST-NEXT: i32.const $push8=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $6, $pop19 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop9), $pop10 -; NO-SIMD128-FAST-NEXT: i32.const $push11=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $7, $pop18 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop12), $pop13 -; 
NO-SIMD128-FAST-NEXT: i32.const $push14=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $8, $pop17 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop16 +; NO-SIMD128-FAST-NEXT: i32.const $push13=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $4, $pop13 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.const $push12=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $5, $pop12 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.const $push11=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $6, $pop11 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.const $push10=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $7, $pop10 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop7 +; NO-SIMD128-FAST-NEXT: i32.const $push9=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $8, $pop9 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop8 ; NO-SIMD128-FAST-NEXT: return %a = xor <8 x i16> %x, @@ -8948,45 +7452,37 @@ define <8 x i16> @andnot_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128: .functype andnot_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, -1 -; NO-SIMD128-NEXT: i32.xor $push1=, $13, $pop0 -; NO-SIMD128-NEXT: i32.and $push2=, $5, $pop1 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push31=, -1 -; NO-SIMD128-NEXT: i32.xor $push3=, $11, $pop31 -; NO-SIMD128-NEXT: i32.and $push4=, $3, $pop3 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push30=, -1 -; NO-SIMD128-NEXT: i32.xor $push5=, $10, $pop30 -; NO-SIMD128-NEXT: i32.and $push6=, $2, $pop5 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop6 -; NO-SIMD128-NEXT: i32.const $push29=, -1 -; NO-SIMD128-NEXT: i32.xor $push7=, $9, $pop29 -; NO-SIMD128-NEXT: i32.and $push8=, $1, $pop7 -; 
NO-SIMD128-NEXT: i32.store16 0($0), $pop8 -; NO-SIMD128-NEXT: i32.const $push11=, 14 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.const $push28=, -1 -; NO-SIMD128-NEXT: i32.xor $push9=, $16, $pop28 -; NO-SIMD128-NEXT: i32.and $push10=, $8, $pop9 -; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push15=, 12 -; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15 -; NO-SIMD128-NEXT: i32.const $push27=, -1 -; NO-SIMD128-NEXT: i32.xor $push13=, $15, $pop27 -; NO-SIMD128-NEXT: i32.and $push14=, $7, $pop13 -; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14 -; NO-SIMD128-NEXT: i32.const $push19=, 10 -; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19 -; NO-SIMD128-NEXT: i32.const $push26=, -1 -; NO-SIMD128-NEXT: i32.xor $push17=, $14, $pop26 -; NO-SIMD128-NEXT: i32.and $push18=, $6, $pop17 -; NO-SIMD128-NEXT: i32.store16 0($pop20), $pop18 -; NO-SIMD128-NEXT: i32.const $push23=, 6 -; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23 -; NO-SIMD128-NEXT: i32.const $push25=, -1 -; NO-SIMD128-NEXT: i32.xor $push21=, $12, $pop25 -; NO-SIMD128-NEXT: i32.and $push22=, $4, $pop21 -; NO-SIMD128-NEXT: i32.store16 0($pop24), $pop22 +; NO-SIMD128-NEXT: i32.xor $push1=, $16, $pop0 +; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop1 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop2 +; NO-SIMD128-NEXT: i32.const $push23=, -1 +; NO-SIMD128-NEXT: i32.xor $push3=, $15, $pop23 +; NO-SIMD128-NEXT: i32.and $push4=, $7, $pop3 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop4 +; NO-SIMD128-NEXT: i32.const $push22=, -1 +; NO-SIMD128-NEXT: i32.xor $push5=, $14, $pop22 +; NO-SIMD128-NEXT: i32.and $push6=, $6, $pop5 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push21=, -1 +; NO-SIMD128-NEXT: i32.xor $push7=, $13, $pop21 +; NO-SIMD128-NEXT: i32.and $push8=, $5, $pop7 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop8 +; NO-SIMD128-NEXT: i32.const $push20=, -1 +; NO-SIMD128-NEXT: i32.xor $push9=, $12, $pop20 +; NO-SIMD128-NEXT: i32.and $push10=, 
$4, $pop9 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop10 +; NO-SIMD128-NEXT: i32.const $push19=, -1 +; NO-SIMD128-NEXT: i32.xor $push11=, $11, $pop19 +; NO-SIMD128-NEXT: i32.and $push12=, $3, $pop11 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop12 +; NO-SIMD128-NEXT: i32.const $push18=, -1 +; NO-SIMD128-NEXT: i32.xor $push13=, $10, $pop18 +; NO-SIMD128-NEXT: i32.and $push14=, $2, $pop13 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop14 +; NO-SIMD128-NEXT: i32.const $push17=, -1 +; NO-SIMD128-NEXT: i32.xor $push15=, $9, $pop17 +; NO-SIMD128-NEXT: i32.and $push16=, $1, $pop15 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop16 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: andnot_v8i16: @@ -8996,42 +7492,34 @@ define <8 x i16> @andnot_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $9, $pop0 ; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop1 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push31=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $10, $pop31 +; NO-SIMD128-FAST-NEXT: i32.const $push23=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $10, $pop23 ; NO-SIMD128-FAST-NEXT: i32.and $push4=, $2, $pop3 ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push30=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $11, $pop30 +; NO-SIMD128-FAST-NEXT: i32.const $push22=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $11, $pop22 ; NO-SIMD128-FAST-NEXT: i32.and $push6=, $3, $pop5 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.const $push29=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $12, $pop29 -; NO-SIMD128-FAST-NEXT: i32.and $push10=, $4, $pop9 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop10 -; NO-SIMD128-FAST-NEXT: i32.const $push28=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $13, $pop28 -; NO-SIMD128-FAST-NEXT: i32.and $push12=, $5, $pop11 -; 
NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $14, $pop27 -; NO-SIMD128-FAST-NEXT: i32.and $push16=, $6, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $15, $pop26 -; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop19 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-FAST-NEXT: i32.const $push25=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $16, $pop25 -; NO-SIMD128-FAST-NEXT: i32.and $push24=, $8, $pop23 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop24 +; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $12, $pop21 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $4, $pop7 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $13, $pop20 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $5, $pop9 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop10 +; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $14, $pop19 +; NO-SIMD128-FAST-NEXT: i32.and $push12=, $6, $pop11 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $15, $pop18 +; NO-SIMD128-FAST-NEXT: i32.and $push14=, $7, $pop13 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $16, $pop17 +; NO-SIMD128-FAST-NEXT: i32.and $push16=, $8, $pop15 +; NO-SIMD128-FAST-NEXT: 
i32.store16 14($0), $pop16 ; NO-SIMD128-FAST-NEXT: return %inv_y = xor <8 x i16> %y, @@ -9058,62 +7546,54 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) { ; NO-SIMD128-LABEL: bitselect_v8i16: ; NO-SIMD128: .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push5=, 14 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 ; NO-SIMD128-NEXT: i32.and $push0=, $16, $8 ; NO-SIMD128-NEXT: i32.const $push1=, -1 ; NO-SIMD128-NEXT: i32.xor $push2=, $8, $pop1 ; NO-SIMD128-NEXT: i32.and $push3=, $24, $pop2 ; NO-SIMD128-NEXT: i32.or $push4=, $pop0, $pop3 -; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4 -; NO-SIMD128-NEXT: i32.const $push11=, 12 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.and $push7=, $15, $7 -; NO-SIMD128-NEXT: i32.const $push47=, -1 -; NO-SIMD128-NEXT: i32.xor $push8=, $7, $pop47 -; NO-SIMD128-NEXT: i32.and $push9=, $23, $pop8 -; NO-SIMD128-NEXT: i32.or $push10=, $pop7, $pop9 -; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push17=, 10 -; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-NEXT: i32.and $push13=, $14, $6 -; NO-SIMD128-NEXT: i32.const $push46=, -1 -; NO-SIMD128-NEXT: i32.xor $push14=, $6, $pop46 -; NO-SIMD128-NEXT: i32.and $push15=, $22, $pop14 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop4 +; NO-SIMD128-NEXT: i32.and $push5=, $15, $7 +; NO-SIMD128-NEXT: i32.const $push39=, -1 +; NO-SIMD128-NEXT: i32.xor $push6=, $7, $pop39 +; NO-SIMD128-NEXT: i32.and $push7=, $23, $pop6 +; NO-SIMD128-NEXT: i32.or $push8=, $pop5, $pop7 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop8 +; NO-SIMD128-NEXT: i32.and $push9=, $14, $6 +; NO-SIMD128-NEXT: i32.const $push38=, -1 +; NO-SIMD128-NEXT: i32.xor $push10=, $6, $pop38 +; NO-SIMD128-NEXT: i32.and $push11=, $22, $pop10 +; NO-SIMD128-NEXT: i32.or $push12=, 
$pop9, $pop11 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop12 +; NO-SIMD128-NEXT: i32.and $push13=, $13, $5 +; NO-SIMD128-NEXT: i32.const $push37=, -1 +; NO-SIMD128-NEXT: i32.xor $push14=, $5, $pop37 +; NO-SIMD128-NEXT: i32.and $push15=, $21, $pop14 ; NO-SIMD128-NEXT: i32.or $push16=, $pop13, $pop15 -; NO-SIMD128-NEXT: i32.store16 0($pop18), $pop16 -; NO-SIMD128-NEXT: i32.and $push19=, $13, $5 -; NO-SIMD128-NEXT: i32.const $push45=, -1 -; NO-SIMD128-NEXT: i32.xor $push20=, $5, $pop45 -; NO-SIMD128-NEXT: i32.and $push21=, $21, $pop20 -; NO-SIMD128-NEXT: i32.or $push22=, $pop19, $pop21 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop22 -; NO-SIMD128-NEXT: i32.const $push27=, 6 -; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-NEXT: i32.and $push23=, $12, $4 -; NO-SIMD128-NEXT: i32.const $push44=, -1 -; NO-SIMD128-NEXT: i32.xor $push24=, $4, $pop44 -; NO-SIMD128-NEXT: i32.and $push25=, $20, $pop24 -; NO-SIMD128-NEXT: i32.or $push26=, $pop23, $pop25 -; NO-SIMD128-NEXT: i32.store16 0($pop28), $pop26 -; NO-SIMD128-NEXT: i32.and $push29=, $11, $3 -; NO-SIMD128-NEXT: i32.const $push43=, -1 -; NO-SIMD128-NEXT: i32.xor $push30=, $3, $pop43 -; NO-SIMD128-NEXT: i32.and $push31=, $19, $pop30 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop16 +; NO-SIMD128-NEXT: i32.and $push17=, $12, $4 +; NO-SIMD128-NEXT: i32.const $push36=, -1 +; NO-SIMD128-NEXT: i32.xor $push18=, $4, $pop36 +; NO-SIMD128-NEXT: i32.and $push19=, $20, $pop18 +; NO-SIMD128-NEXT: i32.or $push20=, $pop17, $pop19 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop20 +; NO-SIMD128-NEXT: i32.and $push21=, $11, $3 +; NO-SIMD128-NEXT: i32.const $push35=, -1 +; NO-SIMD128-NEXT: i32.xor $push22=, $3, $pop35 +; NO-SIMD128-NEXT: i32.and $push23=, $19, $pop22 +; NO-SIMD128-NEXT: i32.or $push24=, $pop21, $pop23 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop24 +; NO-SIMD128-NEXT: i32.and $push25=, $10, $2 +; NO-SIMD128-NEXT: i32.const $push34=, -1 +; NO-SIMD128-NEXT: i32.xor $push26=, $2, $pop34 +; NO-SIMD128-NEXT: i32.and $push27=, 
$18, $pop26 +; NO-SIMD128-NEXT: i32.or $push28=, $pop25, $pop27 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop28 +; NO-SIMD128-NEXT: i32.and $push29=, $9, $1 +; NO-SIMD128-NEXT: i32.const $push33=, -1 +; NO-SIMD128-NEXT: i32.xor $push30=, $1, $pop33 +; NO-SIMD128-NEXT: i32.and $push31=, $17, $pop30 ; NO-SIMD128-NEXT: i32.or $push32=, $pop29, $pop31 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop32 -; NO-SIMD128-NEXT: i32.and $push33=, $10, $2 -; NO-SIMD128-NEXT: i32.const $push42=, -1 -; NO-SIMD128-NEXT: i32.xor $push34=, $2, $pop42 -; NO-SIMD128-NEXT: i32.and $push35=, $18, $pop34 -; NO-SIMD128-NEXT: i32.or $push36=, $pop33, $pop35 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop36 -; NO-SIMD128-NEXT: i32.and $push37=, $9, $1 -; NO-SIMD128-NEXT: i32.const $push41=, -1 -; NO-SIMD128-NEXT: i32.xor $push38=, $1, $pop41 -; NO-SIMD128-NEXT: i32.and $push39=, $17, $pop38 -; NO-SIMD128-NEXT: i32.or $push40=, $pop37, $pop39 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop40 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop32 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: bitselect_v8i16: @@ -9126,55 +7606,47 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) { ; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop0, $pop3 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4 ; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $2 -; NO-SIMD128-FAST-NEXT: i32.const $push47=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop47 +; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop39 ; NO-SIMD128-FAST-NEXT: i32.and $push7=, $18, $pop6 ; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop5, $pop7 ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8 ; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $3 -; NO-SIMD128-FAST-NEXT: i32.const $push46=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop46 +; NO-SIMD128-FAST-NEXT: i32.const $push38=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop38 ; NO-SIMD128-FAST-NEXT: i32.and $push11=, $19, $pop10 
; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop9, $pop11 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 ; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $4 -; NO-SIMD128-FAST-NEXT: i32.const $push45=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop45 +; NO-SIMD128-FAST-NEXT: i32.const $push37=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop37 ; NO-SIMD128-FAST-NEXT: i32.and $push15=, $20, $pop14 ; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop13, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop16 -; NO-SIMD128-FAST-NEXT: i32.and $push19=, $13, $5 -; NO-SIMD128-FAST-NEXT: i32.const $push44=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $5, $pop44 -; NO-SIMD128-FAST-NEXT: i32.and $push21=, $21, $pop20 -; NO-SIMD128-FAST-NEXT: i32.or $push22=, $pop19, $pop21 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.and $push23=, $14, $6 -; NO-SIMD128-FAST-NEXT: i32.const $push43=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $6, $pop43 -; NO-SIMD128-FAST-NEXT: i32.and $push25=, $22, $pop24 -; NO-SIMD128-FAST-NEXT: i32.or $push26=, $pop23, $pop25 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push33=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-FAST-NEXT: i32.and $push29=, $15, $7 -; NO-SIMD128-FAST-NEXT: i32.const $push42=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $7, $pop42 -; NO-SIMD128-FAST-NEXT: i32.and $push31=, $23, $pop30 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $13, $5 +; NO-SIMD128-FAST-NEXT: i32.const $push36=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $5, $pop36 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $21, $pop18 +; NO-SIMD128-FAST-NEXT: i32.or $push20=, $pop17, $pop19 +; 
NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.and $push21=, $14, $6 +; NO-SIMD128-FAST-NEXT: i32.const $push35=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $6, $pop35 +; NO-SIMD128-FAST-NEXT: i32.and $push23=, $22, $pop22 +; NO-SIMD128-FAST-NEXT: i32.or $push24=, $pop21, $pop23 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $7 +; NO-SIMD128-FAST-NEXT: i32.const $push34=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $7, $pop34 +; NO-SIMD128-FAST-NEXT: i32.and $push27=, $23, $pop26 +; NO-SIMD128-FAST-NEXT: i32.or $push28=, $pop25, $pop27 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28 +; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $8 +; NO-SIMD128-FAST-NEXT: i32.const $push33=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $8, $pop33 +; NO-SIMD128-FAST-NEXT: i32.and $push31=, $24, $pop30 ; NO-SIMD128-FAST-NEXT: i32.or $push32=, $pop29, $pop31 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop34), $pop32 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39 -; NO-SIMD128-FAST-NEXT: i32.and $push35=, $16, $8 -; NO-SIMD128-FAST-NEXT: i32.const $push41=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $8, $pop41 -; NO-SIMD128-FAST-NEXT: i32.and $push37=, $24, $pop36 -; NO-SIMD128-FAST-NEXT: i32.or $push38=, $pop35, $pop37 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop40), $pop38 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32 ; NO-SIMD128-FAST-NEXT: return %masked_v1 = and <8 x i16> %v1, %c %inv_mask = xor <8 x i16> @@ -9203,46 +7675,38 @@ define <8 x i16> @bitselect_xor_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2 ; NO-SIMD128-LABEL: bitselect_xor_v8i16: ; NO-SIMD128: .functype bitselect_xor_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push3=, 14 -; NO-SIMD128-NEXT: i32.add $push4=, $0, 
$pop3 ; NO-SIMD128-NEXT: i32.xor $push0=, $16, $24 ; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $8 ; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $24 -; NO-SIMD128-NEXT: i32.store16 0($pop4), $pop2 -; NO-SIMD128-NEXT: i32.const $push8=, 12 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.xor $push5=, $15, $23 -; NO-SIMD128-NEXT: i32.and $push6=, $pop5, $7 -; NO-SIMD128-NEXT: i32.xor $push7=, $pop6, $23 -; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7 -; NO-SIMD128-NEXT: i32.const $push13=, 10 -; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-NEXT: i32.xor $push10=, $14, $22 -; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $6 -; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $22 -; NO-SIMD128-NEXT: i32.store16 0($pop14), $pop12 -; NO-SIMD128-NEXT: i32.xor $push15=, $13, $21 -; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $5 -; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $21 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop17 -; NO-SIMD128-NEXT: i32.const $push21=, 6 -; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21 -; NO-SIMD128-NEXT: i32.xor $push18=, $12, $20 -; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $4 -; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $20 -; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20 -; NO-SIMD128-NEXT: i32.xor $push23=, $11, $19 -; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $3 -; NO-SIMD128-NEXT: i32.xor $push25=, $pop24, $19 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop25 -; NO-SIMD128-NEXT: i32.xor $push26=, $10, $18 -; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $2 -; NO-SIMD128-NEXT: i32.xor $push28=, $pop27, $18 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop28 -; NO-SIMD128-NEXT: i32.xor $push29=, $9, $17 -; NO-SIMD128-NEXT: i32.and $push30=, $pop29, $1 -; NO-SIMD128-NEXT: i32.xor $push31=, $pop30, $17 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop31 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop2 +; NO-SIMD128-NEXT: i32.xor $push3=, $15, $23 +; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $7 +; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $23 
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop5 +; NO-SIMD128-NEXT: i32.xor $push6=, $14, $22 +; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $6 +; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $22 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop8 +; NO-SIMD128-NEXT: i32.xor $push9=, $13, $21 +; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $5 +; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $21 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop11 +; NO-SIMD128-NEXT: i32.xor $push12=, $12, $20 +; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $4 +; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $20 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop14 +; NO-SIMD128-NEXT: i32.xor $push15=, $11, $19 +; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $3 +; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $19 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop17 +; NO-SIMD128-NEXT: i32.xor $push18=, $10, $18 +; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $2 +; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $18 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop20 +; NO-SIMD128-NEXT: i32.xor $push21=, $9, $17 +; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $1 +; NO-SIMD128-NEXT: i32.xor $push23=, $pop22, $17 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop23 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: bitselect_xor_v8i16: @@ -9260,34 +7724,26 @@ define <8 x i16> @bitselect_xor_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2 ; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3 ; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $19 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $12, $20 -; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $4 -; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $pop12, $20 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop10), $pop13 -; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $13, $21 -; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $5 -; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $21 -; 
NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $14, $22 -; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $6 -; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $pop20, $22 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop21 -; NO-SIMD128-FAST-NEXT: i32.const $push22=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push23=, $0, $pop22 -; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $15, $23 -; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $7 -; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $23 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop23), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $16, $24 -; NO-SIMD128-FAST-NEXT: i32.and $push30=, $pop29, $8 -; NO-SIMD128-FAST-NEXT: i32.xor $push31=, $pop30, $24 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop31 +; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $12, $20 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4 +; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $20 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $13, $21 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $pop12, $5 +; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $pop13, $21 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $14, $22 +; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $6 +; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $pop16, $22 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop17 +; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $15, $23 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $7 +; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $23 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $16, $24 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $8 +; NO-SIMD128-FAST-NEXT: i32.xor $push23=, 
$pop22, $24 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop23 ; NO-SIMD128-FAST-NEXT: return %xor1 = xor <8 x i16> %v1, %v2 %and = and <8 x i16> %xor1, %c @@ -9314,62 +7770,54 @@ define <8 x i16> @bitselect_xor_reversed_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x ; NO-SIMD128-LABEL: bitselect_xor_reversed_v8i16: ; NO-SIMD128: .functype bitselect_xor_reversed_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push5=, 14 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 ; NO-SIMD128-NEXT: i32.xor $push2=, $16, $24 ; NO-SIMD128-NEXT: i32.const $push0=, -1 ; NO-SIMD128-NEXT: i32.xor $push1=, $8, $pop0 ; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $pop1 ; NO-SIMD128-NEXT: i32.xor $push4=, $pop3, $24 -; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4 -; NO-SIMD128-NEXT: i32.const $push11=, 12 -; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11 -; NO-SIMD128-NEXT: i32.xor $push8=, $15, $23 -; NO-SIMD128-NEXT: i32.const $push47=, -1 -; NO-SIMD128-NEXT: i32.xor $push7=, $7, $pop47 -; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $pop7 -; NO-SIMD128-NEXT: i32.xor $push10=, $pop9, $23 -; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10 -; NO-SIMD128-NEXT: i32.const $push17=, 10 -; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-NEXT: i32.xor $push14=, $14, $22 -; NO-SIMD128-NEXT: i32.const $push46=, -1 -; NO-SIMD128-NEXT: i32.xor $push13=, $6, $pop46 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop4 +; NO-SIMD128-NEXT: i32.xor $push6=, $15, $23 +; NO-SIMD128-NEXT: i32.const $push39=, -1 +; NO-SIMD128-NEXT: i32.xor $push5=, $7, $pop39 +; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $pop5 +; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $23 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop8 +; NO-SIMD128-NEXT: i32.xor $push10=, $14, $22 +; NO-SIMD128-NEXT: i32.const $push38=, -1 +; NO-SIMD128-NEXT: i32.xor $push9=, $6, $pop38 +; NO-SIMD128-NEXT: i32.and 
$push11=, $pop10, $pop9 +; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $22 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop12 +; NO-SIMD128-NEXT: i32.xor $push14=, $13, $21 +; NO-SIMD128-NEXT: i32.const $push37=, -1 +; NO-SIMD128-NEXT: i32.xor $push13=, $5, $pop37 ; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $pop13 -; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $22 -; NO-SIMD128-NEXT: i32.store16 0($pop18), $pop16 -; NO-SIMD128-NEXT: i32.xor $push20=, $13, $21 -; NO-SIMD128-NEXT: i32.const $push45=, -1 -; NO-SIMD128-NEXT: i32.xor $push19=, $5, $pop45 -; NO-SIMD128-NEXT: i32.and $push21=, $pop20, $pop19 -; NO-SIMD128-NEXT: i32.xor $push22=, $pop21, $21 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop22 -; NO-SIMD128-NEXT: i32.const $push27=, 6 -; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-NEXT: i32.xor $push24=, $12, $20 -; NO-SIMD128-NEXT: i32.const $push44=, -1 -; NO-SIMD128-NEXT: i32.xor $push23=, $4, $pop44 -; NO-SIMD128-NEXT: i32.and $push25=, $pop24, $pop23 -; NO-SIMD128-NEXT: i32.xor $push26=, $pop25, $20 -; NO-SIMD128-NEXT: i32.store16 0($pop28), $pop26 -; NO-SIMD128-NEXT: i32.xor $push30=, $11, $19 -; NO-SIMD128-NEXT: i32.const $push43=, -1 -; NO-SIMD128-NEXT: i32.xor $push29=, $3, $pop43 +; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $21 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop16 +; NO-SIMD128-NEXT: i32.xor $push18=, $12, $20 +; NO-SIMD128-NEXT: i32.const $push36=, -1 +; NO-SIMD128-NEXT: i32.xor $push17=, $4, $pop36 +; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $pop17 +; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $20 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop20 +; NO-SIMD128-NEXT: i32.xor $push22=, $11, $19 +; NO-SIMD128-NEXT: i32.const $push35=, -1 +; NO-SIMD128-NEXT: i32.xor $push21=, $3, $pop35 +; NO-SIMD128-NEXT: i32.and $push23=, $pop22, $pop21 +; NO-SIMD128-NEXT: i32.xor $push24=, $pop23, $19 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop24 +; NO-SIMD128-NEXT: i32.xor $push26=, $10, $18 +; NO-SIMD128-NEXT: i32.const $push34=, -1 +; 
NO-SIMD128-NEXT: i32.xor $push25=, $2, $pop34 +; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $pop25 +; NO-SIMD128-NEXT: i32.xor $push28=, $pop27, $18 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop28 +; NO-SIMD128-NEXT: i32.xor $push30=, $9, $17 +; NO-SIMD128-NEXT: i32.const $push33=, -1 +; NO-SIMD128-NEXT: i32.xor $push29=, $1, $pop33 ; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $pop29 -; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $19 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop32 -; NO-SIMD128-NEXT: i32.xor $push34=, $10, $18 -; NO-SIMD128-NEXT: i32.const $push42=, -1 -; NO-SIMD128-NEXT: i32.xor $push33=, $2, $pop42 -; NO-SIMD128-NEXT: i32.and $push35=, $pop34, $pop33 -; NO-SIMD128-NEXT: i32.xor $push36=, $pop35, $18 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop36 -; NO-SIMD128-NEXT: i32.xor $push38=, $9, $17 -; NO-SIMD128-NEXT: i32.const $push41=, -1 -; NO-SIMD128-NEXT: i32.xor $push37=, $1, $pop41 -; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $pop37 -; NO-SIMD128-NEXT: i32.xor $push40=, $pop39, $17 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop40 +; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $17 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop32 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: bitselect_xor_reversed_v8i16: @@ -9382,55 +7830,47 @@ define <8 x i16> @bitselect_xor_reversed_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x ; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $pop3, $17 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4 ; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $10, $18 -; NO-SIMD128-FAST-NEXT: i32.const $push47=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop47 +; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop39 ; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $pop5 ; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $18 ; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8 ; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $11, $19 -; NO-SIMD128-FAST-NEXT: i32.const $push46=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop46 
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop38 ; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $pop9 ; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $pop11, $19 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 ; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $12, $20 -; NO-SIMD128-FAST-NEXT: i32.const $push45=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop45 +; NO-SIMD128-FAST-NEXT: i32.const $push37=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop37 ; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $pop13 ; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $20 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop16 -; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $13, $21 -; NO-SIMD128-FAST-NEXT: i32.const $push44=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $5, $pop44 -; NO-SIMD128-FAST-NEXT: i32.and $push21=, $pop20, $pop19 -; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $pop21, $21 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop22 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $14, $22 -; NO-SIMD128-FAST-NEXT: i32.const $push43=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $6, $pop43 -; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $pop23 -; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $22 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push33=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33 -; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $15, $23 -; NO-SIMD128-FAST-NEXT: i32.const $push42=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $7, $pop42 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $13, $21 +; NO-SIMD128-FAST-NEXT: i32.const $push36=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $5, $pop36 +; 
NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $pop17 +; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $21 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $14, $22 +; NO-SIMD128-FAST-NEXT: i32.const $push35=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $6, $pop35 +; NO-SIMD128-FAST-NEXT: i32.and $push23=, $pop22, $pop21 +; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $pop23, $22 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $15, $23 +; NO-SIMD128-FAST-NEXT: i32.const $push34=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $7, $pop34 +; NO-SIMD128-FAST-NEXT: i32.and $push27=, $pop26, $pop25 +; NO-SIMD128-FAST-NEXT: i32.xor $push28=, $pop27, $23 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28 +; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $16, $24 +; NO-SIMD128-FAST-NEXT: i32.const $push33=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $8, $pop33 ; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $pop29 -; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $23 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop34), $pop32 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39 -; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $16, $24 -; NO-SIMD128-FAST-NEXT: i32.const $push41=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $8, $pop41 -; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $pop35 -; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $pop37, $24 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop40), $pop38 +; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $24 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32 ; NO-SIMD128-FAST-NEXT: return %xor1 = xor <8 x i16> %v1, %v2 %notc = xor <8 x i16> %c, @extmul_low_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) { ; NO-SIMD128-LABEL: extmul_low_s_v8i16: ; NO-SIMD128: .functype extmul_low_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.extend8_s $push1=, $5 -; NO-SIMD128-NEXT: i32.extend8_s $push0=, $21 +; NO-SIMD128-NEXT: i32.extend8_s $push1=, $8 +; NO-SIMD128-NEXT: i32.extend8_s $push0=, $24 ; NO-SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop2 -; NO-SIMD128-NEXT: i32.extend8_s $push4=, $3 -; NO-SIMD128-NEXT: i32.extend8_s $push3=, $19 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop2 +; NO-SIMD128-NEXT: i32.extend8_s $push4=, $7 +; NO-SIMD128-NEXT: i32.extend8_s $push3=, $23 ; NO-SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop5 -; NO-SIMD128-NEXT: i32.extend8_s $push7=, $2 -; NO-SIMD128-NEXT: i32.extend8_s $push6=, $18 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop5 +; NO-SIMD128-NEXT: i32.extend8_s $push7=, $6 +; NO-SIMD128-NEXT: i32.extend8_s $push6=, $22 ; NO-SIMD128-NEXT: i32.mul $push8=, $pop7, $pop6 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop8 -; NO-SIMD128-NEXT: i32.extend8_s $push10=, $1 -; NO-SIMD128-NEXT: i32.extend8_s $push9=, $17 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop8 +; NO-SIMD128-NEXT: i32.extend8_s $push10=, $5 +; NO-SIMD128-NEXT: i32.extend8_s $push9=, $21 ; NO-SIMD128-NEXT: i32.mul $push11=, $pop10, $pop9 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop11 -; NO-SIMD128-NEXT: i32.const $push15=, 14 -; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15 -; NO-SIMD128-NEXT: i32.extend8_s $push13=, $8 -; NO-SIMD128-NEXT: i32.extend8_s $push12=, $24 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop11 +; NO-SIMD128-NEXT: i32.extend8_s $push13=, $4 +; NO-SIMD128-NEXT: i32.extend8_s $push12=, $20 ; NO-SIMD128-NEXT: i32.mul $push14=, $pop13, $pop12 -; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14 -; NO-SIMD128-NEXT: i32.const $push20=, 12 -; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20 -; NO-SIMD128-NEXT: i32.extend8_s $push18=, $7 -; NO-SIMD128-NEXT: i32.extend8_s $push17=, $23 -; NO-SIMD128-NEXT: i32.mul $push19=, $pop18, 
$pop17 -; NO-SIMD128-NEXT: i32.store16 0($pop21), $pop19 -; NO-SIMD128-NEXT: i32.const $push25=, 10 -; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25 -; NO-SIMD128-NEXT: i32.extend8_s $push23=, $6 -; NO-SIMD128-NEXT: i32.extend8_s $push22=, $22 -; NO-SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22 -; NO-SIMD128-NEXT: i32.store16 0($pop26), $pop24 -; NO-SIMD128-NEXT: i32.const $push30=, 6 -; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30 -; NO-SIMD128-NEXT: i32.extend8_s $push28=, $4 -; NO-SIMD128-NEXT: i32.extend8_s $push27=, $20 -; NO-SIMD128-NEXT: i32.mul $push29=, $pop28, $pop27 -; NO-SIMD128-NEXT: i32.store16 0($pop31), $pop29 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop14 +; NO-SIMD128-NEXT: i32.extend8_s $push16=, $3 +; NO-SIMD128-NEXT: i32.extend8_s $push15=, $19 +; NO-SIMD128-NEXT: i32.mul $push17=, $pop16, $pop15 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop17 +; NO-SIMD128-NEXT: i32.extend8_s $push19=, $2 +; NO-SIMD128-NEXT: i32.extend8_s $push18=, $18 +; NO-SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop20 +; NO-SIMD128-NEXT: i32.extend8_s $push22=, $1 +; NO-SIMD128-NEXT: i32.extend8_s $push21=, $17 +; NO-SIMD128-NEXT: i32.mul $push23=, $pop22, $pop21 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop23 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: extmul_low_s_v8i16: @@ -9515,34 +7947,26 @@ define <8 x i16> @extmul_low_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) { ; NO-SIMD128-FAST-NEXT: i32.extend8_s $push6=, $19 ; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $pop7, $pop6 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $4 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $20 -; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $pop12, $pop11 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop10), $pop13 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $5 -; NO-SIMD128-FAST-NEXT: i32.extend8_s 
$push14=, $21 -; NO-SIMD128-FAST-NEXT: i32.mul $push16=, $pop15, $pop14 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $6 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $22 -; NO-SIMD128-FAST-NEXT: i32.mul $push21=, $pop20, $pop19 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop21 -; NO-SIMD128-FAST-NEXT: i32.const $push22=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push23=, $0, $pop22 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $7 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push24=, $23 -; NO-SIMD128-FAST-NEXT: i32.mul $push26=, $pop25, $pop24 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop23), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push30=, $8 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $24 -; NO-SIMD128-FAST-NEXT: i32.mul $push31=, $pop30, $pop29 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop31 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push10=, $4 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push9=, $20 +; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $5 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $21 +; NO-SIMD128-FAST-NEXT: i32.mul $push14=, $pop13, $pop12 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $6 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $22 +; NO-SIMD128-FAST-NEXT: i32.mul $push17=, $pop16, $pop15 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop17 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $7 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $23 +; NO-SIMD128-FAST-NEXT: i32.mul $push20=, $pop19, $pop18 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $8 +; 
NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $24 +; NO-SIMD128-FAST-NEXT: i32.mul $push23=, $pop22, $pop21 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop23 ; NO-SIMD128-FAST-NEXT: return %low1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> @@ -9572,46 +7996,38 @@ define <8 x i16> @extmul_high_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) { ; NO-SIMD128-LABEL: extmul_high_s_v8i16: ; NO-SIMD128: .functype extmul_high_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.extend8_s $push1=, $13 -; NO-SIMD128-NEXT: i32.extend8_s $push0=, $29 +; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16 +; NO-SIMD128-NEXT: i32.extend8_s $push0=, $32 ; NO-SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop2 -; NO-SIMD128-NEXT: i32.extend8_s $push4=, $11 -; NO-SIMD128-NEXT: i32.extend8_s $push3=, $27 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop2 +; NO-SIMD128-NEXT: i32.extend8_s $push4=, $15 +; NO-SIMD128-NEXT: i32.extend8_s $push3=, $31 ; NO-SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop5 -; NO-SIMD128-NEXT: i32.extend8_s $push7=, $10 -; NO-SIMD128-NEXT: i32.extend8_s $push6=, $26 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop5 +; NO-SIMD128-NEXT: i32.extend8_s $push7=, $14 +; NO-SIMD128-NEXT: i32.extend8_s $push6=, $30 ; NO-SIMD128-NEXT: i32.mul $push8=, $pop7, $pop6 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop8 -; NO-SIMD128-NEXT: i32.extend8_s $push10=, $9 -; NO-SIMD128-NEXT: i32.extend8_s $push9=, $25 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop8 +; NO-SIMD128-NEXT: i32.extend8_s $push10=, $13 +; NO-SIMD128-NEXT: i32.extend8_s $push9=, $29 ; NO-SIMD128-NEXT: i32.mul $push11=, $pop10, $pop9 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop11 -; NO-SIMD128-NEXT: i32.const $push15=, 14 -; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15 -; 
NO-SIMD128-NEXT: i32.extend8_s $push13=, $16 -; NO-SIMD128-NEXT: i32.extend8_s $push12=, $32 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop11 +; NO-SIMD128-NEXT: i32.extend8_s $push13=, $12 +; NO-SIMD128-NEXT: i32.extend8_s $push12=, $28 ; NO-SIMD128-NEXT: i32.mul $push14=, $pop13, $pop12 -; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14 -; NO-SIMD128-NEXT: i32.const $push20=, 12 -; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20 -; NO-SIMD128-NEXT: i32.extend8_s $push18=, $15 -; NO-SIMD128-NEXT: i32.extend8_s $push17=, $31 -; NO-SIMD128-NEXT: i32.mul $push19=, $pop18, $pop17 -; NO-SIMD128-NEXT: i32.store16 0($pop21), $pop19 -; NO-SIMD128-NEXT: i32.const $push25=, 10 -; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25 -; NO-SIMD128-NEXT: i32.extend8_s $push23=, $14 -; NO-SIMD128-NEXT: i32.extend8_s $push22=, $30 -; NO-SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22 -; NO-SIMD128-NEXT: i32.store16 0($pop26), $pop24 -; NO-SIMD128-NEXT: i32.const $push30=, 6 -; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30 -; NO-SIMD128-NEXT: i32.extend8_s $push28=, $12 -; NO-SIMD128-NEXT: i32.extend8_s $push27=, $28 -; NO-SIMD128-NEXT: i32.mul $push29=, $pop28, $pop27 -; NO-SIMD128-NEXT: i32.store16 0($pop31), $pop29 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop14 +; NO-SIMD128-NEXT: i32.extend8_s $push16=, $11 +; NO-SIMD128-NEXT: i32.extend8_s $push15=, $27 +; NO-SIMD128-NEXT: i32.mul $push17=, $pop16, $pop15 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop17 +; NO-SIMD128-NEXT: i32.extend8_s $push19=, $10 +; NO-SIMD128-NEXT: i32.extend8_s $push18=, $26 +; NO-SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop20 +; NO-SIMD128-NEXT: i32.extend8_s $push22=, $9 +; NO-SIMD128-NEXT: i32.extend8_s $push21=, $25 +; NO-SIMD128-NEXT: i32.mul $push23=, $pop22, $pop21 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop23 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: extmul_high_s_v8i16: @@ -9629,34 +8045,26 @@ define <8 x i16> @extmul_high_s_v8i16(<16 x i8> %v1, <16 x i8> 
%v2) { ; NO-SIMD128-FAST-NEXT: i32.extend8_s $push6=, $27 ; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $pop7, $pop6 ; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $12 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $28 -; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $pop12, $pop11 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop10), $pop13 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $13 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push14=, $29 -; NO-SIMD128-FAST-NEXT: i32.mul $push16=, $pop15, $pop14 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop16 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $14 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $30 -; NO-SIMD128-FAST-NEXT: i32.mul $push21=, $pop20, $pop19 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop21 -; NO-SIMD128-FAST-NEXT: i32.const $push22=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push23=, $0, $pop22 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $15 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push24=, $31 -; NO-SIMD128-FAST-NEXT: i32.mul $push26=, $pop25, $pop24 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop23), $pop26 -; NO-SIMD128-FAST-NEXT: i32.const $push27=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push30=, $16 -; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $32 -; NO-SIMD128-FAST-NEXT: i32.mul $push31=, $pop30, $pop29 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop31 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push10=, $12 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push9=, $28 +; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $13 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $29 +; 
NO-SIMD128-FAST-NEXT: i32.mul $push14=, $pop13, $pop12 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $14 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $30 +; NO-SIMD128-FAST-NEXT: i32.mul $push17=, $pop16, $pop15 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop17 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $15 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $31 +; NO-SIMD128-FAST-NEXT: i32.mul $push20=, $pop19, $pop18 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $16 +; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $32 +; NO-SIMD128-FAST-NEXT: i32.mul $push23=, $pop22, $pop21 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop23 ; NO-SIMD128-FAST-NEXT: return %high1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> @@ -9687,61 +8095,53 @@ define <8 x i16> @extmul_low_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) { ; NO-SIMD128: .functype extmul_low_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 255 -; NO-SIMD128-NEXT: i32.and $push2=, $5, $pop0 -; NO-SIMD128-NEXT: i32.const $push47=, 255 -; NO-SIMD128-NEXT: i32.and $push1=, $21, $pop47 -; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push46=, 255 -; NO-SIMD128-NEXT: i32.and $push5=, $3, $pop46 -; NO-SIMD128-NEXT: i32.const $push45=, 255 -; NO-SIMD128-NEXT: i32.and $push4=, $19, $pop45 -; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop6 -; NO-SIMD128-NEXT: i32.const $push44=, 255 -; NO-SIMD128-NEXT: i32.and $push8=, $2, $pop44 -; NO-SIMD128-NEXT: i32.const $push43=, 255 -; NO-SIMD128-NEXT: i32.and $push7=, $18, $pop43 -; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7 -; NO-SIMD128-NEXT: 
i32.store16 2($0), $pop9 -; NO-SIMD128-NEXT: i32.const $push42=, 255 -; NO-SIMD128-NEXT: i32.and $push11=, $1, $pop42 -; NO-SIMD128-NEXT: i32.const $push41=, 255 -; NO-SIMD128-NEXT: i32.and $push10=, $17, $pop41 -; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop12 -; NO-SIMD128-NEXT: i32.const $push16=, 14 -; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.const $push40=, 255 -; NO-SIMD128-NEXT: i32.and $push14=, $8, $pop40 +; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0 ; NO-SIMD128-NEXT: i32.const $push39=, 255 -; NO-SIMD128-NEXT: i32.and $push13=, $24, $pop39 -; NO-SIMD128-NEXT: i32.mul $push15=, $pop14, $pop13 -; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15 -; NO-SIMD128-NEXT: i32.const $push21=, 12 -; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21 +; NO-SIMD128-NEXT: i32.and $push1=, $24, $pop39 +; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop3 ; NO-SIMD128-NEXT: i32.const $push38=, 255 -; NO-SIMD128-NEXT: i32.and $push19=, $7, $pop38 +; NO-SIMD128-NEXT: i32.and $push5=, $7, $pop38 ; NO-SIMD128-NEXT: i32.const $push37=, 255 -; NO-SIMD128-NEXT: i32.and $push18=, $23, $pop37 -; NO-SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18 -; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20 -; NO-SIMD128-NEXT: i32.const $push26=, 10 -; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26 +; NO-SIMD128-NEXT: i32.and $push4=, $23, $pop37 +; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop6 ; NO-SIMD128-NEXT: i32.const $push36=, 255 -; NO-SIMD128-NEXT: i32.and $push24=, $6, $pop36 +; NO-SIMD128-NEXT: i32.and $push8=, $6, $pop36 ; NO-SIMD128-NEXT: i32.const $push35=, 255 -; NO-SIMD128-NEXT: i32.and $push23=, $22, $pop35 -; NO-SIMD128-NEXT: i32.mul $push25=, $pop24, $pop23 -; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25 -; NO-SIMD128-NEXT: i32.const $push31=, 6 -; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31 +; NO-SIMD128-NEXT: 
i32.and $push7=, $22, $pop35 +; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop9 ; NO-SIMD128-NEXT: i32.const $push34=, 255 -; NO-SIMD128-NEXT: i32.and $push29=, $4, $pop34 +; NO-SIMD128-NEXT: i32.and $push11=, $5, $pop34 ; NO-SIMD128-NEXT: i32.const $push33=, 255 -; NO-SIMD128-NEXT: i32.and $push28=, $20, $pop33 -; NO-SIMD128-NEXT: i32.mul $push30=, $pop29, $pop28 -; NO-SIMD128-NEXT: i32.store16 0($pop32), $pop30 +; NO-SIMD128-NEXT: i32.and $push10=, $21, $pop33 +; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop12 +; NO-SIMD128-NEXT: i32.const $push32=, 255 +; NO-SIMD128-NEXT: i32.and $push14=, $4, $pop32 +; NO-SIMD128-NEXT: i32.const $push31=, 255 +; NO-SIMD128-NEXT: i32.and $push13=, $20, $pop31 +; NO-SIMD128-NEXT: i32.mul $push15=, $pop14, $pop13 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop15 +; NO-SIMD128-NEXT: i32.const $push30=, 255 +; NO-SIMD128-NEXT: i32.and $push17=, $3, $pop30 +; NO-SIMD128-NEXT: i32.const $push29=, 255 +; NO-SIMD128-NEXT: i32.and $push16=, $19, $pop29 +; NO-SIMD128-NEXT: i32.mul $push18=, $pop17, $pop16 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop18 +; NO-SIMD128-NEXT: i32.const $push28=, 255 +; NO-SIMD128-NEXT: i32.and $push20=, $2, $pop28 +; NO-SIMD128-NEXT: i32.const $push27=, 255 +; NO-SIMD128-NEXT: i32.and $push19=, $18, $pop27 +; NO-SIMD128-NEXT: i32.mul $push21=, $pop20, $pop19 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop21 +; NO-SIMD128-NEXT: i32.const $push26=, 255 +; NO-SIMD128-NEXT: i32.and $push23=, $1, $pop26 +; NO-SIMD128-NEXT: i32.const $push25=, 255 +; NO-SIMD128-NEXT: i32.and $push22=, $17, $pop25 +; NO-SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop24 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: extmul_low_u_v8i16: @@ -9749,60 +8149,52 @@ define <8 x i16> @extmul_low_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) { ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 
255 ; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0 -; NO-SIMD128-FAST-NEXT: i32.const $push47=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop47 +; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop39 ; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push46=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop46 -; NO-SIMD128-FAST-NEXT: i32.const $push45=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop45 -; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4 -; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push44=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop44 -; NO-SIMD128-FAST-NEXT: i32.const $push43=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop43 -; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7 -; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push42=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop42 -; NO-SIMD128-FAST-NEXT: i32.const $push41=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop41 -; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push40=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push16=, $5, $pop40 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push15=, $21, $pop39 -; NO-SIMD128-FAST-NEXT: i32.mul $push17=, $pop16, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10 -; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21 ; NO-SIMD128-FAST-NEXT: i32.const $push38=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push19=, $6, $pop38 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop38 ; 
NO-SIMD128-FAST-NEXT: i32.const $push37=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push18=, $22, $pop37 -; NO-SIMD128-FAST-NEXT: i32.mul $push20=, $pop19, $pop18 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 +; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop37 +; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4 +; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6 ; NO-SIMD128-FAST-NEXT: i32.const $push36=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push24=, $7, $pop36 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop36 ; NO-SIMD128-FAST-NEXT: i32.const $push35=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push23=, $23, $pop35 -; NO-SIMD128-FAST-NEXT: i32.mul $push25=, $pop24, $pop23 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push31=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop35 +; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7 +; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9 ; NO-SIMD128-FAST-NEXT: i32.const $push34=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $pop34 +; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop34 ; NO-SIMD128-FAST-NEXT: i32.const $push33=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push28=, $24, $pop33 -; NO-SIMD128-FAST-NEXT: i32.mul $push30=, $pop29, $pop28 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop32), $pop30 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop33 +; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push32=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push14=, $5, $pop32 +; NO-SIMD128-FAST-NEXT: i32.const $push31=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $21, $pop31 +; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $pop14, $pop13 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15 +; NO-SIMD128-FAST-NEXT: i32.const 
$push30=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $6, $pop30 +; NO-SIMD128-FAST-NEXT: i32.const $push29=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push16=, $22, $pop29 +; NO-SIMD128-FAST-NEXT: i32.mul $push18=, $pop17, $pop16 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18 +; NO-SIMD128-FAST-NEXT: i32.const $push28=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop28 +; NO-SIMD128-FAST-NEXT: i32.const $push27=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $23, $pop27 +; NO-SIMD128-FAST-NEXT: i32.mul $push21=, $pop20, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21 +; NO-SIMD128-FAST-NEXT: i32.const $push26=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push23=, $8, $pop26 +; NO-SIMD128-FAST-NEXT: i32.const $push25=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $24, $pop25 +; NO-SIMD128-FAST-NEXT: i32.mul $push24=, $pop23, $pop22 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24 ; NO-SIMD128-FAST-NEXT: return %low1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> @@ -9833,61 +8225,53 @@ define <8 x i16> @extmul_high_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) { ; NO-SIMD128: .functype extmul_high_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 255 -; NO-SIMD128-NEXT: i32.and $push2=, $13, $pop0 -; NO-SIMD128-NEXT: i32.const $push47=, 255 -; NO-SIMD128-NEXT: i32.and $push1=, $29, $pop47 -; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push46=, 255 -; NO-SIMD128-NEXT: i32.and $push5=, $11, $pop46 -; NO-SIMD128-NEXT: i32.const $push45=, 255 -; NO-SIMD128-NEXT: i32.and $push4=, $27, $pop45 -; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop6 -; NO-SIMD128-NEXT: i32.const $push44=, 255 -; NO-SIMD128-NEXT: i32.and $push8=, $10, 
$pop44 -; NO-SIMD128-NEXT: i32.const $push43=, 255 -; NO-SIMD128-NEXT: i32.and $push7=, $26, $pop43 -; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop9 -; NO-SIMD128-NEXT: i32.const $push42=, 255 -; NO-SIMD128-NEXT: i32.and $push11=, $9, $pop42 -; NO-SIMD128-NEXT: i32.const $push41=, 255 -; NO-SIMD128-NEXT: i32.and $push10=, $25, $pop41 -; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop12 -; NO-SIMD128-NEXT: i32.const $push16=, 14 -; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.const $push40=, 255 -; NO-SIMD128-NEXT: i32.and $push14=, $16, $pop40 +; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0 ; NO-SIMD128-NEXT: i32.const $push39=, 255 -; NO-SIMD128-NEXT: i32.and $push13=, $32, $pop39 -; NO-SIMD128-NEXT: i32.mul $push15=, $pop14, $pop13 -; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15 -; NO-SIMD128-NEXT: i32.const $push21=, 12 -; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21 +; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop39 +; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop3 ; NO-SIMD128-NEXT: i32.const $push38=, 255 -; NO-SIMD128-NEXT: i32.and $push19=, $15, $pop38 +; NO-SIMD128-NEXT: i32.and $push5=, $15, $pop38 ; NO-SIMD128-NEXT: i32.const $push37=, 255 -; NO-SIMD128-NEXT: i32.and $push18=, $31, $pop37 -; NO-SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18 -; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20 -; NO-SIMD128-NEXT: i32.const $push26=, 10 -; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26 +; NO-SIMD128-NEXT: i32.and $push4=, $31, $pop37 +; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop6 ; NO-SIMD128-NEXT: i32.const $push36=, 255 -; NO-SIMD128-NEXT: i32.and $push24=, $14, $pop36 +; NO-SIMD128-NEXT: i32.and $push8=, $14, $pop36 ; NO-SIMD128-NEXT: i32.const $push35=, 255 -; NO-SIMD128-NEXT: i32.and $push23=, $30, $pop35 -; NO-SIMD128-NEXT: i32.mul $push25=, 
$pop24, $pop23 -; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25 -; NO-SIMD128-NEXT: i32.const $push31=, 6 -; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31 +; NO-SIMD128-NEXT: i32.and $push7=, $30, $pop35 +; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop9 ; NO-SIMD128-NEXT: i32.const $push34=, 255 -; NO-SIMD128-NEXT: i32.and $push29=, $12, $pop34 +; NO-SIMD128-NEXT: i32.and $push11=, $13, $pop34 ; NO-SIMD128-NEXT: i32.const $push33=, 255 -; NO-SIMD128-NEXT: i32.and $push28=, $28, $pop33 -; NO-SIMD128-NEXT: i32.mul $push30=, $pop29, $pop28 -; NO-SIMD128-NEXT: i32.store16 0($pop32), $pop30 +; NO-SIMD128-NEXT: i32.and $push10=, $29, $pop33 +; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop12 +; NO-SIMD128-NEXT: i32.const $push32=, 255 +; NO-SIMD128-NEXT: i32.and $push14=, $12, $pop32 +; NO-SIMD128-NEXT: i32.const $push31=, 255 +; NO-SIMD128-NEXT: i32.and $push13=, $28, $pop31 +; NO-SIMD128-NEXT: i32.mul $push15=, $pop14, $pop13 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop15 +; NO-SIMD128-NEXT: i32.const $push30=, 255 +; NO-SIMD128-NEXT: i32.and $push17=, $11, $pop30 +; NO-SIMD128-NEXT: i32.const $push29=, 255 +; NO-SIMD128-NEXT: i32.and $push16=, $27, $pop29 +; NO-SIMD128-NEXT: i32.mul $push18=, $pop17, $pop16 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop18 +; NO-SIMD128-NEXT: i32.const $push28=, 255 +; NO-SIMD128-NEXT: i32.and $push20=, $10, $pop28 +; NO-SIMD128-NEXT: i32.const $push27=, 255 +; NO-SIMD128-NEXT: i32.and $push19=, $26, $pop27 +; NO-SIMD128-NEXT: i32.mul $push21=, $pop20, $pop19 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop21 +; NO-SIMD128-NEXT: i32.const $push26=, 255 +; NO-SIMD128-NEXT: i32.and $push23=, $9, $pop26 +; NO-SIMD128-NEXT: i32.const $push25=, 255 +; NO-SIMD128-NEXT: i32.and $push22=, $25, $pop25 +; NO-SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop24 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: 
extmul_high_u_v8i16: @@ -9895,60 +8279,52 @@ define <8 x i16> @extmul_high_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) { ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255 ; NO-SIMD128-FAST-NEXT: i32.and $push2=, $9, $pop0 -; NO-SIMD128-FAST-NEXT: i32.const $push47=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push1=, $25, $pop47 +; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push1=, $25, $pop39 ; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1 ; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push46=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop46 -; NO-SIMD128-FAST-NEXT: i32.const $push45=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push4=, $26, $pop45 -; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4 -; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push44=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push8=, $11, $pop44 -; NO-SIMD128-FAST-NEXT: i32.const $push43=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push7=, $27, $pop43 -; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7 -; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 6 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 -; NO-SIMD128-FAST-NEXT: i32.const $push42=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push11=, $12, $pop42 -; NO-SIMD128-FAST-NEXT: i32.const $push41=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push10=, $28, $pop41 -; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push40=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push16=, $13, $pop40 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push15=, $29, $pop39 -; NO-SIMD128-FAST-NEXT: i32.mul $push17=, $pop16, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10 -; NO-SIMD128-FAST-NEXT: i32.add 
$push22=, $0, $pop21 ; NO-SIMD128-FAST-NEXT: i32.const $push38=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push19=, $14, $pop38 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop38 ; NO-SIMD128-FAST-NEXT: i32.const $push37=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push18=, $30, $pop37 -; NO-SIMD128-FAST-NEXT: i32.mul $push20=, $pop19, $pop18 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26 +; NO-SIMD128-FAST-NEXT: i32.and $push4=, $26, $pop37 +; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4 +; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6 ; NO-SIMD128-FAST-NEXT: i32.const $push36=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push24=, $15, $pop36 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $11, $pop36 ; NO-SIMD128-FAST-NEXT: i32.const $push35=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push23=, $31, $pop35 -; NO-SIMD128-FAST-NEXT: i32.mul $push25=, $pop24, $pop23 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25 -; NO-SIMD128-FAST-NEXT: i32.const $push31=, 14 -; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $27, $pop35 +; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7 +; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9 ; NO-SIMD128-FAST-NEXT: i32.const $push34=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $pop34 +; NO-SIMD128-FAST-NEXT: i32.and $push11=, $12, $pop34 ; NO-SIMD128-FAST-NEXT: i32.const $push33=, 255 -; NO-SIMD128-FAST-NEXT: i32.and $push28=, $32, $pop33 -; NO-SIMD128-FAST-NEXT: i32.mul $push30=, $pop29, $pop28 -; NO-SIMD128-FAST-NEXT: i32.store16 0($pop32), $pop30 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $28, $pop33 +; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push32=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push14=, $13, $pop32 +; NO-SIMD128-FAST-NEXT: i32.const $push31=, 255 +; 
NO-SIMD128-FAST-NEXT: i32.and $push13=, $29, $pop31 +; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $pop14, $pop13 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15 +; NO-SIMD128-FAST-NEXT: i32.const $push30=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $14, $pop30 +; NO-SIMD128-FAST-NEXT: i32.const $push29=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push16=, $30, $pop29 +; NO-SIMD128-FAST-NEXT: i32.mul $push18=, $pop17, $pop16 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18 +; NO-SIMD128-FAST-NEXT: i32.const $push28=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push20=, $15, $pop28 +; NO-SIMD128-FAST-NEXT: i32.const $push27=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $31, $pop27 +; NO-SIMD128-FAST-NEXT: i32.mul $push21=, $pop20, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21 +; NO-SIMD128-FAST-NEXT: i32.const $push26=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push23=, $16, $pop26 +; NO-SIMD128-FAST-NEXT: i32.const $push25=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $32, $pop25 +; NO-SIMD128-FAST-NEXT: i32.mul $push24=, $pop23, $pop22 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24 ; NO-SIMD128-FAST-NEXT: return %high1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> @@ -9979,16 +8355,14 @@ define <4 x i32> @add_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-LABEL: add_v4i32: ; NO-SIMD128: .functype add_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.add $push0=, $3, $7 -; NO-SIMD128-NEXT: i32.store 8($0), $pop0 -; NO-SIMD128-NEXT: i32.add $push1=, $2, $6 -; NO-SIMD128-NEXT: i32.store 4($0), $pop1 -; NO-SIMD128-NEXT: i32.add $push2=, $1, $5 -; NO-SIMD128-NEXT: i32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.add $push3=, $4, $8 -; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: i32.add $push0=, $4, $8 +; NO-SIMD128-NEXT: i32.store 12($0), $pop0 +; NO-SIMD128-NEXT: i32.add 
$push1=, $3, $7 +; NO-SIMD128-NEXT: i32.store 8($0), $pop1 +; NO-SIMD128-NEXT: i32.add $push2=, $2, $6 +; NO-SIMD128-NEXT: i32.store 4($0), $pop2 +; NO-SIMD128-NEXT: i32.add $push3=, $1, $5 +; NO-SIMD128-NEXT: i32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: add_v4i32: @@ -10000,10 +8374,8 @@ define <4 x i32> @add_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.add $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.add $push5=, $4, $8 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: i32.add $push3=, $4, $8 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = add <4 x i32> %x, %y ret <4 x i32> %a @@ -10025,16 +8397,14 @@ define <4 x i32> @sub_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-LABEL: sub_v4i32: ; NO-SIMD128: .functype sub_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.sub $push0=, $3, $7 -; NO-SIMD128-NEXT: i32.store 8($0), $pop0 -; NO-SIMD128-NEXT: i32.sub $push1=, $2, $6 -; NO-SIMD128-NEXT: i32.store 4($0), $pop1 -; NO-SIMD128-NEXT: i32.sub $push2=, $1, $5 -; NO-SIMD128-NEXT: i32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.sub $push3=, $4, $8 -; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: i32.sub $push0=, $4, $8 +; NO-SIMD128-NEXT: i32.store 12($0), $pop0 +; NO-SIMD128-NEXT: i32.sub $push1=, $3, $7 +; NO-SIMD128-NEXT: i32.store 8($0), $pop1 +; NO-SIMD128-NEXT: i32.sub $push2=, $2, $6 +; NO-SIMD128-NEXT: i32.store 4($0), $pop2 +; NO-SIMD128-NEXT: i32.sub $push3=, $1, $5 +; NO-SIMD128-NEXT: i32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: sub_v4i32: @@ -10046,10 +8416,8 @@ 
define <4 x i32> @sub_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $4, $8 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $4, $8 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = sub <4 x i32> %x, %y ret <4 x i32> %a @@ -10071,16 +8439,14 @@ define <4 x i32> @mul_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-LABEL: mul_v4i32: ; NO-SIMD128: .functype mul_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.mul $push0=, $3, $7 -; NO-SIMD128-NEXT: i32.store 8($0), $pop0 -; NO-SIMD128-NEXT: i32.mul $push1=, $2, $6 -; NO-SIMD128-NEXT: i32.store 4($0), $pop1 -; NO-SIMD128-NEXT: i32.mul $push2=, $1, $5 -; NO-SIMD128-NEXT: i32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.mul $push3=, $4, $8 -; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: i32.mul $push0=, $4, $8 +; NO-SIMD128-NEXT: i32.store 12($0), $pop0 +; NO-SIMD128-NEXT: i32.mul $push1=, $3, $7 +; NO-SIMD128-NEXT: i32.store 8($0), $pop1 +; NO-SIMD128-NEXT: i32.mul $push2=, $2, $6 +; NO-SIMD128-NEXT: i32.store 4($0), $pop2 +; NO-SIMD128-NEXT: i32.mul $push3=, $1, $5 +; NO-SIMD128-NEXT: i32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: mul_v4i32: @@ -10092,10 +8458,8 @@ define <4 x i32> @mul_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.mul $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: 
i32.mul $push5=, $4, $8 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $4, $8 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = mul <4 x i32> %x, %y ret <4 x i32> %a @@ -10117,20 +8481,18 @@ define <4 x i32> @min_s_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-LABEL: min_s_v4i32: ; NO-SIMD128: .functype min_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.lt_s $push0=, $3, $7 -; NO-SIMD128-NEXT: i32.select $push1=, $3, $7, $pop0 -; NO-SIMD128-NEXT: i32.store 8($0), $pop1 -; NO-SIMD128-NEXT: i32.lt_s $push2=, $2, $6 -; NO-SIMD128-NEXT: i32.select $push3=, $2, $6, $pop2 -; NO-SIMD128-NEXT: i32.store 4($0), $pop3 -; NO-SIMD128-NEXT: i32.lt_s $push4=, $1, $5 -; NO-SIMD128-NEXT: i32.select $push5=, $1, $5, $pop4 -; NO-SIMD128-NEXT: i32.store 0($0), $pop5 -; NO-SIMD128-NEXT: i32.const $push8=, 12 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.lt_s $push6=, $4, $8 -; NO-SIMD128-NEXT: i32.select $push7=, $4, $8, $pop6 -; NO-SIMD128-NEXT: i32.store 0($pop9), $pop7 +; NO-SIMD128-NEXT: i32.lt_s $push0=, $4, $8 +; NO-SIMD128-NEXT: i32.select $push1=, $4, $8, $pop0 +; NO-SIMD128-NEXT: i32.store 12($0), $pop1 +; NO-SIMD128-NEXT: i32.lt_s $push2=, $3, $7 +; NO-SIMD128-NEXT: i32.select $push3=, $3, $7, $pop2 +; NO-SIMD128-NEXT: i32.store 8($0), $pop3 +; NO-SIMD128-NEXT: i32.lt_s $push4=, $2, $6 +; NO-SIMD128-NEXT: i32.select $push5=, $2, $6, $pop4 +; NO-SIMD128-NEXT: i32.store 4($0), $pop5 +; NO-SIMD128-NEXT: i32.lt_s $push6=, $1, $5 +; NO-SIMD128-NEXT: i32.select $push7=, $1, $5, $pop6 +; NO-SIMD128-NEXT: i32.store 0($0), $pop7 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: min_s_v4i32: @@ -10145,11 +8507,9 @@ define <4 x i32> @min_s_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-FAST-NEXT: i32.lt_s $push4=, $3, $7 ; NO-SIMD128-FAST-NEXT: i32.select $push5=, $3, $7, $pop4 ; NO-SIMD128-FAST-NEXT: 
i32.store 8($0), $pop5 -; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8 ; NO-SIMD128-FAST-NEXT: i32.lt_s $push6=, $4, $8 ; NO-SIMD128-FAST-NEXT: i32.select $push7=, $4, $8, $pop6 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop9), $pop7 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop7 ; NO-SIMD128-FAST-NEXT: return %c = icmp slt <4 x i32> %x, %y %a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y @@ -10172,20 +8532,18 @@ define <4 x i32> @min_u_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-LABEL: min_u_v4i32: ; NO-SIMD128: .functype min_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.lt_u $push0=, $3, $7 -; NO-SIMD128-NEXT: i32.select $push1=, $3, $7, $pop0 -; NO-SIMD128-NEXT: i32.store 8($0), $pop1 -; NO-SIMD128-NEXT: i32.lt_u $push2=, $2, $6 -; NO-SIMD128-NEXT: i32.select $push3=, $2, $6, $pop2 -; NO-SIMD128-NEXT: i32.store 4($0), $pop3 -; NO-SIMD128-NEXT: i32.lt_u $push4=, $1, $5 -; NO-SIMD128-NEXT: i32.select $push5=, $1, $5, $pop4 -; NO-SIMD128-NEXT: i32.store 0($0), $pop5 -; NO-SIMD128-NEXT: i32.const $push8=, 12 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.lt_u $push6=, $4, $8 -; NO-SIMD128-NEXT: i32.select $push7=, $4, $8, $pop6 -; NO-SIMD128-NEXT: i32.store 0($pop9), $pop7 +; NO-SIMD128-NEXT: i32.lt_u $push0=, $4, $8 +; NO-SIMD128-NEXT: i32.select $push1=, $4, $8, $pop0 +; NO-SIMD128-NEXT: i32.store 12($0), $pop1 +; NO-SIMD128-NEXT: i32.lt_u $push2=, $3, $7 +; NO-SIMD128-NEXT: i32.select $push3=, $3, $7, $pop2 +; NO-SIMD128-NEXT: i32.store 8($0), $pop3 +; NO-SIMD128-NEXT: i32.lt_u $push4=, $2, $6 +; NO-SIMD128-NEXT: i32.select $push5=, $2, $6, $pop4 +; NO-SIMD128-NEXT: i32.store 4($0), $pop5 +; NO-SIMD128-NEXT: i32.lt_u $push6=, $1, $5 +; NO-SIMD128-NEXT: i32.select $push7=, $1, $5, $pop6 +; NO-SIMD128-NEXT: i32.store 0($0), $pop7 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: min_u_v4i32: @@ -10200,11 +8558,9 @@ 
define <4 x i32> @min_u_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-FAST-NEXT: i32.lt_u $push4=, $3, $7 ; NO-SIMD128-FAST-NEXT: i32.select $push5=, $3, $7, $pop4 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop5 -; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8 ; NO-SIMD128-FAST-NEXT: i32.lt_u $push6=, $4, $8 ; NO-SIMD128-FAST-NEXT: i32.select $push7=, $4, $8, $pop6 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop9), $pop7 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop7 ; NO-SIMD128-FAST-NEXT: return %c = icmp ult <4 x i32> %x, %y %a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y @@ -10227,20 +8583,18 @@ define <4 x i32> @max_s_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-LABEL: max_s_v4i32: ; NO-SIMD128: .functype max_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.gt_s $push0=, $3, $7 -; NO-SIMD128-NEXT: i32.select $push1=, $3, $7, $pop0 -; NO-SIMD128-NEXT: i32.store 8($0), $pop1 -; NO-SIMD128-NEXT: i32.gt_s $push2=, $2, $6 -; NO-SIMD128-NEXT: i32.select $push3=, $2, $6, $pop2 -; NO-SIMD128-NEXT: i32.store 4($0), $pop3 -; NO-SIMD128-NEXT: i32.gt_s $push4=, $1, $5 -; NO-SIMD128-NEXT: i32.select $push5=, $1, $5, $pop4 -; NO-SIMD128-NEXT: i32.store 0($0), $pop5 -; NO-SIMD128-NEXT: i32.const $push8=, 12 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.gt_s $push6=, $4, $8 -; NO-SIMD128-NEXT: i32.select $push7=, $4, $8, $pop6 -; NO-SIMD128-NEXT: i32.store 0($pop9), $pop7 +; NO-SIMD128-NEXT: i32.gt_s $push0=, $4, $8 +; NO-SIMD128-NEXT: i32.select $push1=, $4, $8, $pop0 +; NO-SIMD128-NEXT: i32.store 12($0), $pop1 +; NO-SIMD128-NEXT: i32.gt_s $push2=, $3, $7 +; NO-SIMD128-NEXT: i32.select $push3=, $3, $7, $pop2 +; NO-SIMD128-NEXT: i32.store 8($0), $pop3 +; NO-SIMD128-NEXT: i32.gt_s $push4=, $2, $6 +; NO-SIMD128-NEXT: i32.select $push5=, $2, $6, $pop4 +; NO-SIMD128-NEXT: i32.store 4($0), $pop5 +; NO-SIMD128-NEXT: i32.gt_s $push6=, $1, $5 
+; NO-SIMD128-NEXT: i32.select $push7=, $1, $5, $pop6 +; NO-SIMD128-NEXT: i32.store 0($0), $pop7 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: max_s_v4i32: @@ -10255,11 +8609,9 @@ define <4 x i32> @max_s_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-FAST-NEXT: i32.gt_s $push4=, $3, $7 ; NO-SIMD128-FAST-NEXT: i32.select $push5=, $3, $7, $pop4 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop5 -; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8 ; NO-SIMD128-FAST-NEXT: i32.gt_s $push6=, $4, $8 ; NO-SIMD128-FAST-NEXT: i32.select $push7=, $4, $8, $pop6 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop9), $pop7 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop7 ; NO-SIMD128-FAST-NEXT: return %c = icmp sgt <4 x i32> %x, %y %a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y @@ -10282,20 +8634,18 @@ define <4 x i32> @max_u_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-LABEL: max_u_v4i32: ; NO-SIMD128: .functype max_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.gt_u $push0=, $3, $7 -; NO-SIMD128-NEXT: i32.select $push1=, $3, $7, $pop0 -; NO-SIMD128-NEXT: i32.store 8($0), $pop1 -; NO-SIMD128-NEXT: i32.gt_u $push2=, $2, $6 -; NO-SIMD128-NEXT: i32.select $push3=, $2, $6, $pop2 -; NO-SIMD128-NEXT: i32.store 4($0), $pop3 -; NO-SIMD128-NEXT: i32.gt_u $push4=, $1, $5 -; NO-SIMD128-NEXT: i32.select $push5=, $1, $5, $pop4 -; NO-SIMD128-NEXT: i32.store 0($0), $pop5 -; NO-SIMD128-NEXT: i32.const $push8=, 12 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.gt_u $push6=, $4, $8 -; NO-SIMD128-NEXT: i32.select $push7=, $4, $8, $pop6 -; NO-SIMD128-NEXT: i32.store 0($pop9), $pop7 +; NO-SIMD128-NEXT: i32.gt_u $push0=, $4, $8 +; NO-SIMD128-NEXT: i32.select $push1=, $4, $8, $pop0 +; NO-SIMD128-NEXT: i32.store 12($0), $pop1 +; NO-SIMD128-NEXT: i32.gt_u $push2=, $3, $7 +; NO-SIMD128-NEXT: i32.select $push3=, $3, $7, $pop2 +; NO-SIMD128-NEXT: i32.store 8($0), $pop3 
+; NO-SIMD128-NEXT: i32.gt_u $push4=, $2, $6 +; NO-SIMD128-NEXT: i32.select $push5=, $2, $6, $pop4 +; NO-SIMD128-NEXT: i32.store 4($0), $pop5 +; NO-SIMD128-NEXT: i32.gt_u $push6=, $1, $5 +; NO-SIMD128-NEXT: i32.select $push7=, $1, $5, $pop6 +; NO-SIMD128-NEXT: i32.store 0($0), $pop7 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: max_u_v4i32: @@ -10310,11 +8660,9 @@ define <4 x i32> @max_u_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-FAST-NEXT: i32.gt_u $push4=, $3, $7 ; NO-SIMD128-FAST-NEXT: i32.select $push5=, $3, $7, $pop4 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop5 -; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8 ; NO-SIMD128-FAST-NEXT: i32.gt_u $push6=, $4, $8 ; NO-SIMD128-FAST-NEXT: i32.select $push7=, $4, $8, $pop6 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop9), $pop7 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop7 ; NO-SIMD128-FAST-NEXT: return %c = icmp ugt <4 x i32> %x, %y %a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y @@ -10337,63 +8685,59 @@ define <4 x i32> @abs_v4i32(<4 x i32> %x) { ; NO-SIMD128-LABEL: abs_v4i32: ; NO-SIMD128: .functype abs_v4i32 (i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push3=, 12 -; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3 ; NO-SIMD128-NEXT: i32.const $push0=, 31 -; NO-SIMD128-NEXT: i32.shr_s $push21=, $4, $pop0 -; NO-SIMD128-NEXT: local.tee $push20=, $5=, $pop21 -; NO-SIMD128-NEXT: i32.xor $push1=, $4, $pop20 +; NO-SIMD128-NEXT: i32.shr_s $push19=, $4, $pop0 +; NO-SIMD128-NEXT: local.tee $push18=, $5=, $pop19 +; NO-SIMD128-NEXT: i32.xor $push1=, $4, $pop18 ; NO-SIMD128-NEXT: i32.sub $push2=, $pop1, $5 -; NO-SIMD128-NEXT: i32.store 0($pop4), $pop2 -; NO-SIMD128-NEXT: i32.const $push19=, 31 -; NO-SIMD128-NEXT: i32.shr_s $push18=, $3, $pop19 -; NO-SIMD128-NEXT: local.tee $push17=, $4=, $pop18 -; NO-SIMD128-NEXT: i32.xor $push5=, $3, $pop17 +; NO-SIMD128-NEXT: i32.store 12($0), $pop2 +; NO-SIMD128-NEXT: i32.const 
$push17=, 31 +; NO-SIMD128-NEXT: i32.shr_s $push16=, $3, $pop17 +; NO-SIMD128-NEXT: local.tee $push15=, $4=, $pop16 +; NO-SIMD128-NEXT: i32.xor $push3=, $3, $pop15 +; NO-SIMD128-NEXT: i32.sub $push4=, $pop3, $4 +; NO-SIMD128-NEXT: i32.store 8($0), $pop4 +; NO-SIMD128-NEXT: i32.const $push14=, 31 +; NO-SIMD128-NEXT: i32.shr_s $push13=, $2, $pop14 +; NO-SIMD128-NEXT: local.tee $push12=, $4=, $pop13 +; NO-SIMD128-NEXT: i32.xor $push5=, $2, $pop12 ; NO-SIMD128-NEXT: i32.sub $push6=, $pop5, $4 -; NO-SIMD128-NEXT: i32.store 8($0), $pop6 -; NO-SIMD128-NEXT: i32.const $push16=, 31 -; NO-SIMD128-NEXT: i32.shr_s $push15=, $2, $pop16 -; NO-SIMD128-NEXT: local.tee $push14=, $4=, $pop15 -; NO-SIMD128-NEXT: i32.xor $push7=, $2, $pop14 +; NO-SIMD128-NEXT: i32.store 4($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push11=, 31 +; NO-SIMD128-NEXT: i32.shr_s $push10=, $1, $pop11 +; NO-SIMD128-NEXT: local.tee $push9=, $4=, $pop10 +; NO-SIMD128-NEXT: i32.xor $push7=, $1, $pop9 ; NO-SIMD128-NEXT: i32.sub $push8=, $pop7, $4 -; NO-SIMD128-NEXT: i32.store 4($0), $pop8 -; NO-SIMD128-NEXT: i32.const $push13=, 31 -; NO-SIMD128-NEXT: i32.shr_s $push12=, $1, $pop13 -; NO-SIMD128-NEXT: local.tee $push11=, $4=, $pop12 -; NO-SIMD128-NEXT: i32.xor $push9=, $1, $pop11 -; NO-SIMD128-NEXT: i32.sub $push10=, $pop9, $4 -; NO-SIMD128-NEXT: i32.store 0($0), $pop10 +; NO-SIMD128-NEXT: i32.store 0($0), $pop8 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: abs_v4i32: ; NO-SIMD128-FAST: .functype abs_v4i32 (i32, i32, i32, i32, i32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 31 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push21=, $1, $pop0 -; NO-SIMD128-FAST-NEXT: local.tee $push20=, $5=, $pop21 -; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop20 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push19=, $1, $pop0 +; NO-SIMD128-FAST-NEXT: local.tee $push18=, $5=, $pop19 +; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop18 ; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop1, $5 ; 
NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push19=, 31 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push18=, $2, $pop19 -; NO-SIMD128-FAST-NEXT: local.tee $push17=, $1=, $pop18 -; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $2, $pop17 +; NO-SIMD128-FAST-NEXT: i32.const $push17=, 31 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push16=, $2, $pop17 +; NO-SIMD128-FAST-NEXT: local.tee $push15=, $1=, $pop16 +; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $2, $pop15 ; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $pop3, $1 ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push16=, 31 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push15=, $3, $pop16 -; NO-SIMD128-FAST-NEXT: local.tee $push14=, $2=, $pop15 -; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $3, $pop14 +; NO-SIMD128-FAST-NEXT: i32.const $push14=, 31 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push13=, $3, $pop14 +; NO-SIMD128-FAST-NEXT: local.tee $push12=, $2=, $pop13 +; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $3, $pop12 ; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop5, $2 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 31 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $4, $pop13 -; NO-SIMD128-FAST-NEXT: local.tee $push11=, $0=, $pop12 -; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $4, $pop11 -; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $pop7, $0 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop10), $pop8 +; NO-SIMD128-FAST-NEXT: i32.const $push11=, 31 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push10=, $4, $pop11 +; NO-SIMD128-FAST-NEXT: local.tee $push9=, $3=, $pop10 +; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $4, $pop9 +; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $pop7, $3 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop8 ; NO-SIMD128-FAST-NEXT: return %a = sub <4 x i32> zeroinitializer, %x %b = icmp slt <4 x i32> %x, zeroinitializer @@ -10418,19 +8762,17 @@ define <4 x i32> @neg_v4i32(<4 x 
i32> %x) { ; NO-SIMD128: .functype neg_v4i32 (i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 0 -; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $3 -; NO-SIMD128-NEXT: i32.store 8($0), $pop1 -; NO-SIMD128-NEXT: i32.const $push9=, 0 -; NO-SIMD128-NEXT: i32.sub $push2=, $pop9, $2 -; NO-SIMD128-NEXT: i32.store 4($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push8=, 0 -; NO-SIMD128-NEXT: i32.sub $push3=, $pop8, $1 -; NO-SIMD128-NEXT: i32.store 0($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push5=, 12 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 +; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $4 +; NO-SIMD128-NEXT: i32.store 12($0), $pop1 ; NO-SIMD128-NEXT: i32.const $push7=, 0 -; NO-SIMD128-NEXT: i32.sub $push4=, $pop7, $4 -; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4 +; NO-SIMD128-NEXT: i32.sub $push2=, $pop7, $3 +; NO-SIMD128-NEXT: i32.store 8($0), $pop2 +; NO-SIMD128-NEXT: i32.const $push6=, 0 +; NO-SIMD128-NEXT: i32.sub $push3=, $pop6, $2 +; NO-SIMD128-NEXT: i32.store 4($0), $pop3 +; NO-SIMD128-NEXT: i32.const $push5=, 0 +; NO-SIMD128-NEXT: i32.sub $push4=, $pop5, $1 +; NO-SIMD128-NEXT: i32.store 0($0), $pop4 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: neg_v4i32: @@ -10439,17 +8781,15 @@ define <4 x i32> @neg_v4i32(<4 x i32> %x) { ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 0 ; NO-SIMD128-FAST-NEXT: i32.sub $push1=, $pop0, $1 ; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop1 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop9, $2 +; NO-SIMD128-FAST-NEXT: i32.const $push7=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop7, $2 ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push8=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop8, $3 +; NO-SIMD128-FAST-NEXT: i32.const $push6=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop6, $3 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push4=, 12 -; NO-SIMD128-FAST-NEXT: 
i32.add $push5=, $0, $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 0 -; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop7, $4 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop5), $pop6 +; NO-SIMD128-FAST-NEXT: i32.const $push5=, 0 +; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $pop5, $4 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop4 ; NO-SIMD128-FAST-NEXT: return %a = sub <4 x i32> , %x ret <4 x i32> %a @@ -10471,16 +8811,14 @@ define <4 x i32> @shl_v4i32(<4 x i32> %v, i32 %x) { ; NO-SIMD128-LABEL: shl_v4i32: ; NO-SIMD128: .functype shl_v4i32 (i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.shl $push0=, $3, $5 -; NO-SIMD128-NEXT: i32.store 8($0), $pop0 -; NO-SIMD128-NEXT: i32.shl $push1=, $2, $5 -; NO-SIMD128-NEXT: i32.store 4($0), $pop1 -; NO-SIMD128-NEXT: i32.shl $push2=, $1, $5 -; NO-SIMD128-NEXT: i32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.shl $push3=, $4, $5 -; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: i32.shl $push0=, $4, $5 +; NO-SIMD128-NEXT: i32.store 12($0), $pop0 +; NO-SIMD128-NEXT: i32.shl $push1=, $3, $5 +; NO-SIMD128-NEXT: i32.store 8($0), $pop1 +; NO-SIMD128-NEXT: i32.shl $push2=, $2, $5 +; NO-SIMD128-NEXT: i32.store 4($0), $pop2 +; NO-SIMD128-NEXT: i32.shl $push3=, $1, $5 +; NO-SIMD128-NEXT: i32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shl_v4i32: @@ -10492,10 +8830,8 @@ define <4 x i32> @shl_v4i32(<4 x i32> %v, i32 %x) { ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $3, $5 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $4, $5 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $4, $5 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: 
return %t = insertelement <4 x i32> undef, i32 %x, i32 0 %s = shufflevector <4 x i32> %t, <4 x i32> undef, @@ -10523,19 +8859,17 @@ define <4 x i32> @shl_const_v4i32(<4 x i32> %v) { ; NO-SIMD128: .functype shl_const_v4i32 (i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 5 -; NO-SIMD128-NEXT: i32.shl $push1=, $3, $pop0 -; NO-SIMD128-NEXT: i32.store 8($0), $pop1 -; NO-SIMD128-NEXT: i32.const $push9=, 5 -; NO-SIMD128-NEXT: i32.shl $push2=, $2, $pop9 -; NO-SIMD128-NEXT: i32.store 4($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push8=, 5 -; NO-SIMD128-NEXT: i32.shl $push3=, $1, $pop8 -; NO-SIMD128-NEXT: i32.store 0($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push5=, 12 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 +; NO-SIMD128-NEXT: i32.shl $push1=, $4, $pop0 +; NO-SIMD128-NEXT: i32.store 12($0), $pop1 ; NO-SIMD128-NEXT: i32.const $push7=, 5 -; NO-SIMD128-NEXT: i32.shl $push4=, $4, $pop7 -; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4 +; NO-SIMD128-NEXT: i32.shl $push2=, $3, $pop7 +; NO-SIMD128-NEXT: i32.store 8($0), $pop2 +; NO-SIMD128-NEXT: i32.const $push6=, 5 +; NO-SIMD128-NEXT: i32.shl $push3=, $2, $pop6 +; NO-SIMD128-NEXT: i32.store 4($0), $pop3 +; NO-SIMD128-NEXT: i32.const $push5=, 5 +; NO-SIMD128-NEXT: i32.shl $push4=, $1, $pop5 +; NO-SIMD128-NEXT: i32.store 0($0), $pop4 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shl_const_v4i32: @@ -10544,17 +8878,15 @@ define <4 x i32> @shl_const_v4i32(<4 x i32> %v) { ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 5 ; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $1, $pop0 ; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop1 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop9 +; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop7 ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push8=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop8 +; NO-SIMD128-FAST-NEXT: 
i32.const $push6=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop6 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push4=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5 -; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $pop7 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop5), $pop6 +; NO-SIMD128-FAST-NEXT: i32.const $push5=, 5 +; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $pop5 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop4 ; NO-SIMD128-FAST-NEXT: return %a = shl <4 x i32> %v, ret <4 x i32> %a @@ -10606,16 +8938,14 @@ define <4 x i32> @shl_vec_v4i32(<4 x i32> %v, <4 x i32> %x) { ; NO-SIMD128-LABEL: shl_vec_v4i32: ; NO-SIMD128: .functype shl_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.shl $push0=, $3, $7 -; NO-SIMD128-NEXT: i32.store 8($0), $pop0 -; NO-SIMD128-NEXT: i32.shl $push1=, $2, $6 -; NO-SIMD128-NEXT: i32.store 4($0), $pop1 -; NO-SIMD128-NEXT: i32.shl $push2=, $1, $5 -; NO-SIMD128-NEXT: i32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.shl $push3=, $4, $8 -; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: i32.shl $push0=, $4, $8 +; NO-SIMD128-NEXT: i32.store 12($0), $pop0 +; NO-SIMD128-NEXT: i32.shl $push1=, $3, $7 +; NO-SIMD128-NEXT: i32.store 8($0), $pop1 +; NO-SIMD128-NEXT: i32.shl $push2=, $2, $6 +; NO-SIMD128-NEXT: i32.store 4($0), $pop2 +; NO-SIMD128-NEXT: i32.shl $push3=, $1, $5 +; NO-SIMD128-NEXT: i32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shl_vec_v4i32: @@ -10627,10 +8957,8 @@ define <4 x i32> @shl_vec_v4i32(<4 x i32> %v, <4 x i32> %x) { ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, 
$0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $4, $8 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $4, $8 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = shl <4 x i32> %v, %x ret <4 x i32> %a @@ -10652,16 +8980,14 @@ define <4 x i32> @shr_s_v4i32(<4 x i32> %v, i32 %x) { ; NO-SIMD128-LABEL: shr_s_v4i32: ; NO-SIMD128: .functype shr_s_v4i32 (i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.shr_s $push0=, $3, $5 -; NO-SIMD128-NEXT: i32.store 8($0), $pop0 -; NO-SIMD128-NEXT: i32.shr_s $push1=, $2, $5 -; NO-SIMD128-NEXT: i32.store 4($0), $pop1 -; NO-SIMD128-NEXT: i32.shr_s $push2=, $1, $5 -; NO-SIMD128-NEXT: i32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.shr_s $push3=, $4, $5 -; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: i32.shr_s $push0=, $4, $5 +; NO-SIMD128-NEXT: i32.store 12($0), $pop0 +; NO-SIMD128-NEXT: i32.shr_s $push1=, $3, $5 +; NO-SIMD128-NEXT: i32.store 8($0), $pop1 +; NO-SIMD128-NEXT: i32.shr_s $push2=, $2, $5 +; NO-SIMD128-NEXT: i32.store 4($0), $pop2 +; NO-SIMD128-NEXT: i32.shr_s $push3=, $1, $5 +; NO-SIMD128-NEXT: i32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shr_s_v4i32: @@ -10673,10 +8999,8 @@ define <4 x i32> @shr_s_v4i32(<4 x i32> %v, i32 %x) { ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $3, $5 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push5=, $4, $5 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push3=, $4, $5 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %t = insertelement <4 x i32> undef, i32 %x, i32 0 %s = shufflevector <4 x i32> 
%t, <4 x i32> undef, @@ -10731,16 +9055,14 @@ define <4 x i32> @shr_s_vec_v4i32(<4 x i32> %v, <4 x i32> %x) { ; NO-SIMD128-LABEL: shr_s_vec_v4i32: ; NO-SIMD128: .functype shr_s_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.shr_s $push0=, $3, $7 -; NO-SIMD128-NEXT: i32.store 8($0), $pop0 -; NO-SIMD128-NEXT: i32.shr_s $push1=, $2, $6 -; NO-SIMD128-NEXT: i32.store 4($0), $pop1 -; NO-SIMD128-NEXT: i32.shr_s $push2=, $1, $5 -; NO-SIMD128-NEXT: i32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.shr_s $push3=, $4, $8 -; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: i32.shr_s $push0=, $4, $8 +; NO-SIMD128-NEXT: i32.store 12($0), $pop0 +; NO-SIMD128-NEXT: i32.shr_s $push1=, $3, $7 +; NO-SIMD128-NEXT: i32.store 8($0), $pop1 +; NO-SIMD128-NEXT: i32.shr_s $push2=, $2, $6 +; NO-SIMD128-NEXT: i32.store 4($0), $pop2 +; NO-SIMD128-NEXT: i32.shr_s $push3=, $1, $5 +; NO-SIMD128-NEXT: i32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shr_s_vec_v4i32: @@ -10752,10 +9074,8 @@ define <4 x i32> @shr_s_vec_v4i32(<4 x i32> %v, <4 x i32> %x) { ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.shr_s $push5=, $4, $8 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: i32.shr_s $push3=, $4, $8 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = ashr <4 x i32> %v, %x ret <4 x i32> %a @@ -10777,16 +9097,14 @@ define <4 x i32> @shr_u_v4i32(<4 x i32> %v, i32 %x) { ; NO-SIMD128-LABEL: shr_u_v4i32: ; NO-SIMD128: .functype shr_u_v4i32 (i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.shr_u 
$push0=, $3, $5 -; NO-SIMD128-NEXT: i32.store 8($0), $pop0 -; NO-SIMD128-NEXT: i32.shr_u $push1=, $2, $5 -; NO-SIMD128-NEXT: i32.store 4($0), $pop1 -; NO-SIMD128-NEXT: i32.shr_u $push2=, $1, $5 -; NO-SIMD128-NEXT: i32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.shr_u $push3=, $4, $5 -; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: i32.shr_u $push0=, $4, $5 +; NO-SIMD128-NEXT: i32.store 12($0), $pop0 +; NO-SIMD128-NEXT: i32.shr_u $push1=, $3, $5 +; NO-SIMD128-NEXT: i32.store 8($0), $pop1 +; NO-SIMD128-NEXT: i32.shr_u $push2=, $2, $5 +; NO-SIMD128-NEXT: i32.store 4($0), $pop2 +; NO-SIMD128-NEXT: i32.shr_u $push3=, $1, $5 +; NO-SIMD128-NEXT: i32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shr_u_v4i32: @@ -10798,10 +9116,8 @@ define <4 x i32> @shr_u_v4i32(<4 x i32> %v, i32 %x) { ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $3, $5 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $4, $5 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push3=, $4, $5 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %t = insertelement <4 x i32> undef, i32 %x, i32 0 %s = shufflevector <4 x i32> %t, <4 x i32> undef, @@ -10856,16 +9172,14 @@ define <4 x i32> @shr_u_vec_v4i32(<4 x i32> %v, <4 x i32> %x) { ; NO-SIMD128-LABEL: shr_u_vec_v4i32: ; NO-SIMD128: .functype shr_u_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.shr_u $push0=, $3, $7 -; NO-SIMD128-NEXT: i32.store 8($0), $pop0 -; NO-SIMD128-NEXT: i32.shr_u $push1=, $2, $6 -; NO-SIMD128-NEXT: i32.store 4($0), $pop1 -; NO-SIMD128-NEXT: i32.shr_u $push2=, $1, $5 -; NO-SIMD128-NEXT: i32.store 
0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.shr_u $push3=, $4, $8 -; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: i32.shr_u $push0=, $4, $8 +; NO-SIMD128-NEXT: i32.store 12($0), $pop0 +; NO-SIMD128-NEXT: i32.shr_u $push1=, $3, $7 +; NO-SIMD128-NEXT: i32.store 8($0), $pop1 +; NO-SIMD128-NEXT: i32.shr_u $push2=, $2, $6 +; NO-SIMD128-NEXT: i32.store 4($0), $pop2 +; NO-SIMD128-NEXT: i32.shr_u $push3=, $1, $5 +; NO-SIMD128-NEXT: i32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: shr_u_vec_v4i32: @@ -10877,10 +9191,8 @@ define <4 x i32> @shr_u_vec_v4i32(<4 x i32> %v, <4 x i32> %x) { ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $4, $8 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push3=, $4, $8 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = lshr <4 x i32> %v, %x ret <4 x i32> %a @@ -10902,16 +9214,14 @@ define <4 x i32> @and_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-LABEL: and_v4i32: ; NO-SIMD128: .functype and_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.and $push0=, $3, $7 -; NO-SIMD128-NEXT: i32.store 8($0), $pop0 -; NO-SIMD128-NEXT: i32.and $push1=, $2, $6 -; NO-SIMD128-NEXT: i32.store 4($0), $pop1 -; NO-SIMD128-NEXT: i32.and $push2=, $1, $5 -; NO-SIMD128-NEXT: i32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.and $push3=, $4, $8 -; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: i32.and $push0=, $4, $8 +; NO-SIMD128-NEXT: i32.store 12($0), $pop0 +; 
NO-SIMD128-NEXT: i32.and $push1=, $3, $7 +; NO-SIMD128-NEXT: i32.store 8($0), $pop1 +; NO-SIMD128-NEXT: i32.and $push2=, $2, $6 +; NO-SIMD128-NEXT: i32.store 4($0), $pop2 +; NO-SIMD128-NEXT: i32.and $push3=, $1, $5 +; NO-SIMD128-NEXT: i32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: and_v4i32: @@ -10923,10 +9233,8 @@ define <4 x i32> @and_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.and $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $4, $8 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: i32.and $push3=, $4, $8 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = and <4 x i32> %x, %y ret <4 x i32> %a @@ -10948,16 +9256,14 @@ define <4 x i32> @or_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-LABEL: or_v4i32: ; NO-SIMD128: .functype or_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.or $push0=, $3, $7 -; NO-SIMD128-NEXT: i32.store 8($0), $pop0 -; NO-SIMD128-NEXT: i32.or $push1=, $2, $6 -; NO-SIMD128-NEXT: i32.store 4($0), $pop1 -; NO-SIMD128-NEXT: i32.or $push2=, $1, $5 -; NO-SIMD128-NEXT: i32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.or $push3=, $4, $8 -; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: i32.or $push0=, $4, $8 +; NO-SIMD128-NEXT: i32.store 12($0), $pop0 +; NO-SIMD128-NEXT: i32.or $push1=, $3, $7 +; NO-SIMD128-NEXT: i32.store 8($0), $pop1 +; NO-SIMD128-NEXT: i32.or $push2=, $2, $6 +; NO-SIMD128-NEXT: i32.store 4($0), $pop2 +; NO-SIMD128-NEXT: i32.or $push3=, $1, $5 +; NO-SIMD128-NEXT: i32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: or_v4i32: @@ -10969,10 
+9275,8 @@ define <4 x i32> @or_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.or $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.or $push5=, $4, $8 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: i32.or $push3=, $4, $8 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = or <4 x i32> %x, %y ret <4 x i32> %a @@ -10994,16 +9298,14 @@ define <4 x i32> @xor_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-LABEL: xor_v4i32: ; NO-SIMD128: .functype xor_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.xor $push0=, $3, $7 -; NO-SIMD128-NEXT: i32.store 8($0), $pop0 -; NO-SIMD128-NEXT: i32.xor $push1=, $2, $6 -; NO-SIMD128-NEXT: i32.store 4($0), $pop1 -; NO-SIMD128-NEXT: i32.xor $push2=, $1, $5 -; NO-SIMD128-NEXT: i32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.xor $push3=, $4, $8 -; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: i32.xor $push0=, $4, $8 +; NO-SIMD128-NEXT: i32.store 12($0), $pop0 +; NO-SIMD128-NEXT: i32.xor $push1=, $3, $7 +; NO-SIMD128-NEXT: i32.store 8($0), $pop1 +; NO-SIMD128-NEXT: i32.xor $push2=, $2, $6 +; NO-SIMD128-NEXT: i32.store 4($0), $pop2 +; NO-SIMD128-NEXT: i32.xor $push3=, $1, $5 +; NO-SIMD128-NEXT: i32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: xor_v4i32: @@ -11015,10 +9317,8 @@ define <4 x i32> @xor_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; 
NO-SIMD128-FAST-NEXT: i32.xor $push5=, $4, $8 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $4, $8 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = xor <4 x i32> %x, %y ret <4 x i32> %a @@ -11041,19 +9341,17 @@ define <4 x i32> @not_v4i32(<4 x i32> %x) { ; NO-SIMD128: .functype not_v4i32 (i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, -1 -; NO-SIMD128-NEXT: i32.xor $push1=, $3, $pop0 -; NO-SIMD128-NEXT: i32.store 8($0), $pop1 -; NO-SIMD128-NEXT: i32.const $push9=, -1 -; NO-SIMD128-NEXT: i32.xor $push2=, $2, $pop9 -; NO-SIMD128-NEXT: i32.store 4($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push8=, -1 -; NO-SIMD128-NEXT: i32.xor $push3=, $1, $pop8 -; NO-SIMD128-NEXT: i32.store 0($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push5=, 12 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 +; NO-SIMD128-NEXT: i32.xor $push1=, $4, $pop0 +; NO-SIMD128-NEXT: i32.store 12($0), $pop1 ; NO-SIMD128-NEXT: i32.const $push7=, -1 -; NO-SIMD128-NEXT: i32.xor $push4=, $4, $pop7 -; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4 +; NO-SIMD128-NEXT: i32.xor $push2=, $3, $pop7 +; NO-SIMD128-NEXT: i32.store 8($0), $pop2 +; NO-SIMD128-NEXT: i32.const $push6=, -1 +; NO-SIMD128-NEXT: i32.xor $push3=, $2, $pop6 +; NO-SIMD128-NEXT: i32.store 4($0), $pop3 +; NO-SIMD128-NEXT: i32.const $push5=, -1 +; NO-SIMD128-NEXT: i32.xor $push4=, $1, $pop5 +; NO-SIMD128-NEXT: i32.store 0($0), $pop4 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: not_v4i32: @@ -11062,17 +9360,15 @@ define <4 x i32> @not_v4i32(<4 x i32> %x) { ; NO-SIMD128-FAST-NEXT: i32.const $push0=, -1 ; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop0 ; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop1 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop9 +; NO-SIMD128-FAST-NEXT: i32.const $push7=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop7 ; NO-SIMD128-FAST-NEXT: 
i32.store 4($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push8=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop8 +; NO-SIMD128-FAST-NEXT: i32.const $push6=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop6 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push4=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $4, $pop7 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop5), $pop6 +; NO-SIMD128-FAST-NEXT: i32.const $push5=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $4, $pop5 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop4 ; NO-SIMD128-FAST-NEXT: return %a = xor <4 x i32> %x, ret <4 x i32> %a @@ -11096,23 +9392,21 @@ define <4 x i32> @andnot_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128: .functype andnot_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, -1 -; NO-SIMD128-NEXT: i32.xor $push1=, $7, $pop0 -; NO-SIMD128-NEXT: i32.and $push2=, $3, $pop1 -; NO-SIMD128-NEXT: i32.store 8($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push13=, -1 -; NO-SIMD128-NEXT: i32.xor $push3=, $6, $pop13 -; NO-SIMD128-NEXT: i32.and $push4=, $2, $pop3 -; NO-SIMD128-NEXT: i32.store 4($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push12=, -1 -; NO-SIMD128-NEXT: i32.xor $push5=, $5, $pop12 -; NO-SIMD128-NEXT: i32.and $push6=, $1, $pop5 -; NO-SIMD128-NEXT: i32.store 0($0), $pop6 -; NO-SIMD128-NEXT: i32.const $push9=, 12 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 +; NO-SIMD128-NEXT: i32.xor $push1=, $8, $pop0 +; NO-SIMD128-NEXT: i32.and $push2=, $4, $pop1 +; NO-SIMD128-NEXT: i32.store 12($0), $pop2 ; NO-SIMD128-NEXT: i32.const $push11=, -1 -; NO-SIMD128-NEXT: i32.xor $push7=, $8, $pop11 -; NO-SIMD128-NEXT: i32.and $push8=, $4, $pop7 -; NO-SIMD128-NEXT: i32.store 0($pop10), $pop8 +; NO-SIMD128-NEXT: i32.xor $push3=, $7, $pop11 +; NO-SIMD128-NEXT: i32.and $push4=, $3, $pop3 +; 
NO-SIMD128-NEXT: i32.store 8($0), $pop4 +; NO-SIMD128-NEXT: i32.const $push10=, -1 +; NO-SIMD128-NEXT: i32.xor $push5=, $6, $pop10 +; NO-SIMD128-NEXT: i32.and $push6=, $2, $pop5 +; NO-SIMD128-NEXT: i32.store 4($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push9=, -1 +; NO-SIMD128-NEXT: i32.xor $push7=, $5, $pop9 +; NO-SIMD128-NEXT: i32.and $push8=, $1, $pop7 +; NO-SIMD128-NEXT: i32.store 0($0), $pop8 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: andnot_v4i32: @@ -11122,20 +9416,18 @@ define <4 x i32> @andnot_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $5, $pop0 ; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop1 ; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $6, $pop13 +; NO-SIMD128-FAST-NEXT: i32.const $push11=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $6, $pop11 ; NO-SIMD128-FAST-NEXT: i32.and $push4=, $2, $pop3 ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push12=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $7, $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push10=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $7, $pop10 ; NO-SIMD128-FAST-NEXT: i32.and $push6=, $3, $pop5 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push7=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7 -; NO-SIMD128-FAST-NEXT: i32.const $push11=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $8, $pop11 -; NO-SIMD128-FAST-NEXT: i32.and $push10=, $4, $pop9 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop8), $pop10 +; NO-SIMD128-FAST-NEXT: i32.const $push9=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $8, $pop9 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $4, $pop7 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop8 ; NO-SIMD128-FAST-NEXT: return %inv_y = xor <4 x i32> %y, %a = and <4 x i32> %x, %inv_y @@ -11161,32 +9453,30 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) { 
; NO-SIMD128-LABEL: bitselect_v4i32: ; NO-SIMD128: .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push5=, 12 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 ; NO-SIMD128-NEXT: i32.const $push1=, -1 ; NO-SIMD128-NEXT: i32.xor $push2=, $4, $pop1 ; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $12 ; NO-SIMD128-NEXT: i32.and $push0=, $4, $8 ; NO-SIMD128-NEXT: i32.or $push4=, $pop3, $pop0 -; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4 -; NO-SIMD128-NEXT: i32.const $push21=, -1 -; NO-SIMD128-NEXT: i32.xor $push8=, $3, $pop21 -; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $11 -; NO-SIMD128-NEXT: i32.and $push7=, $3, $7 -; NO-SIMD128-NEXT: i32.or $push10=, $pop9, $pop7 -; NO-SIMD128-NEXT: i32.store 8($0), $pop10 -; NO-SIMD128-NEXT: i32.const $push20=, -1 -; NO-SIMD128-NEXT: i32.xor $push12=, $2, $pop20 -; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $10 -; NO-SIMD128-NEXT: i32.and $push11=, $2, $6 -; NO-SIMD128-NEXT: i32.or $push14=, $pop13, $pop11 -; NO-SIMD128-NEXT: i32.store 4($0), $pop14 +; NO-SIMD128-NEXT: i32.store 12($0), $pop4 ; NO-SIMD128-NEXT: i32.const $push19=, -1 -; NO-SIMD128-NEXT: i32.xor $push16=, $1, $pop19 -; NO-SIMD128-NEXT: i32.and $push17=, $pop16, $9 -; NO-SIMD128-NEXT: i32.and $push15=, $1, $5 -; NO-SIMD128-NEXT: i32.or $push18=, $pop17, $pop15 -; NO-SIMD128-NEXT: i32.store 0($0), $pop18 +; NO-SIMD128-NEXT: i32.xor $push6=, $3, $pop19 +; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $11 +; NO-SIMD128-NEXT: i32.and $push5=, $3, $7 +; NO-SIMD128-NEXT: i32.or $push8=, $pop7, $pop5 +; NO-SIMD128-NEXT: i32.store 8($0), $pop8 +; NO-SIMD128-NEXT: i32.const $push18=, -1 +; NO-SIMD128-NEXT: i32.xor $push10=, $2, $pop18 +; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $10 +; NO-SIMD128-NEXT: i32.and $push9=, $2, $6 +; NO-SIMD128-NEXT: i32.or $push12=, $pop11, $pop9 +; NO-SIMD128-NEXT: i32.store 4($0), $pop12 +; NO-SIMD128-NEXT: i32.const $push17=, -1 +; NO-SIMD128-NEXT: 
i32.xor $push14=, $1, $pop17 +; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $9 +; NO-SIMD128-NEXT: i32.and $push13=, $1, $5 +; NO-SIMD128-NEXT: i32.or $push16=, $pop15, $pop13 +; NO-SIMD128-NEXT: i32.store 0($0), $pop16 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: bitselect_v4i32: @@ -11198,26 +9488,24 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) { ; NO-SIMD128-FAST-NEXT: i32.and $push0=, $1, $5 ; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop3, $pop0 ; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop21 +; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop19 ; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $10 ; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $6 ; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop7, $pop5 ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop20 +; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop18 ; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $11 ; NO-SIMD128-FAST-NEXT: i32.and $push9=, $3, $7 ; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop11, $pop9 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 -; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop19 +; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop17 ; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $12 ; NO-SIMD128-FAST-NEXT: i32.and $push13=, $4, $8 ; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop15, $pop13 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop18), $pop16 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop16 ; NO-SIMD128-FAST-NEXT: return %masked_v1 = and <4 x i32> %c, %v1 %inv_mask = 
xor <4 x i32> , %c @@ -11244,24 +9532,22 @@ define <4 x i32> @bitselect_xor_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2 ; NO-SIMD128-LABEL: bitselect_xor_v4i32: ; NO-SIMD128: .functype bitselect_xor_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push3=, 12 -; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3 ; NO-SIMD128-NEXT: i32.xor $push0=, $8, $12 ; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $4 ; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $12 -; NO-SIMD128-NEXT: i32.store 0($pop4), $pop2 -; NO-SIMD128-NEXT: i32.xor $push5=, $7, $11 -; NO-SIMD128-NEXT: i32.and $push6=, $pop5, $3 -; NO-SIMD128-NEXT: i32.xor $push7=, $pop6, $11 -; NO-SIMD128-NEXT: i32.store 8($0), $pop7 -; NO-SIMD128-NEXT: i32.xor $push8=, $6, $10 -; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $2 -; NO-SIMD128-NEXT: i32.xor $push10=, $pop9, $10 -; NO-SIMD128-NEXT: i32.store 4($0), $pop10 -; NO-SIMD128-NEXT: i32.xor $push11=, $5, $9 -; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $1 -; NO-SIMD128-NEXT: i32.xor $push13=, $pop12, $9 -; NO-SIMD128-NEXT: i32.store 0($0), $pop13 +; NO-SIMD128-NEXT: i32.store 12($0), $pop2 +; NO-SIMD128-NEXT: i32.xor $push3=, $7, $11 +; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $3 +; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $11 +; NO-SIMD128-NEXT: i32.store 8($0), $pop5 +; NO-SIMD128-NEXT: i32.xor $push6=, $6, $10 +; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $2 +; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $10 +; NO-SIMD128-NEXT: i32.store 4($0), $pop8 +; NO-SIMD128-NEXT: i32.xor $push9=, $5, $9 +; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $1 +; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $9 +; NO-SIMD128-NEXT: i32.store 0($0), $pop11 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: bitselect_xor_v4i32: @@ -11279,12 +9565,10 @@ define <4 x i32> @bitselect_xor_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2 ; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3 ; NO-SIMD128-FAST-NEXT: i32.xor 
$push8=, $pop7, $11 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $8, $12 -; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $4 -; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $pop12, $12 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop10), $pop13 +; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $8, $12 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4 +; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $12 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop11 ; NO-SIMD128-FAST-NEXT: return %xor1 = xor <4 x i32> %v1, %v2 %and = and <4 x i32> %xor1, %c @@ -11311,32 +9595,30 @@ define <4 x i32> @bitselect_xor_reversed_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x ; NO-SIMD128-LABEL: bitselect_xor_reversed_v4i32: ; NO-SIMD128: .functype bitselect_xor_reversed_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push5=, 12 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 ; NO-SIMD128-NEXT: i32.xor $push2=, $8, $12 ; NO-SIMD128-NEXT: i32.const $push0=, -1 ; NO-SIMD128-NEXT: i32.xor $push1=, $4, $pop0 ; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $pop1 ; NO-SIMD128-NEXT: i32.xor $push4=, $pop3, $12 -; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4 -; NO-SIMD128-NEXT: i32.xor $push8=, $7, $11 -; NO-SIMD128-NEXT: i32.const $push21=, -1 -; NO-SIMD128-NEXT: i32.xor $push7=, $3, $pop21 -; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $pop7 -; NO-SIMD128-NEXT: i32.xor $push10=, $pop9, $11 -; NO-SIMD128-NEXT: i32.store 8($0), $pop10 -; NO-SIMD128-NEXT: i32.xor $push12=, $6, $10 -; NO-SIMD128-NEXT: i32.const $push20=, -1 -; NO-SIMD128-NEXT: i32.xor $push11=, $2, $pop20 -; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $pop11 -; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $10 -; NO-SIMD128-NEXT: i32.store 4($0), $pop14 -; NO-SIMD128-NEXT: i32.xor $push16=, $5, $9 +; NO-SIMD128-NEXT: i32.store 12($0), 
$pop4 +; NO-SIMD128-NEXT: i32.xor $push6=, $7, $11 ; NO-SIMD128-NEXT: i32.const $push19=, -1 -; NO-SIMD128-NEXT: i32.xor $push15=, $1, $pop19 -; NO-SIMD128-NEXT: i32.and $push17=, $pop16, $pop15 -; NO-SIMD128-NEXT: i32.xor $push18=, $pop17, $9 -; NO-SIMD128-NEXT: i32.store 0($0), $pop18 +; NO-SIMD128-NEXT: i32.xor $push5=, $3, $pop19 +; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $pop5 +; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $11 +; NO-SIMD128-NEXT: i32.store 8($0), $pop8 +; NO-SIMD128-NEXT: i32.xor $push10=, $6, $10 +; NO-SIMD128-NEXT: i32.const $push18=, -1 +; NO-SIMD128-NEXT: i32.xor $push9=, $2, $pop18 +; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $pop9 +; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $10 +; NO-SIMD128-NEXT: i32.store 4($0), $pop12 +; NO-SIMD128-NEXT: i32.xor $push14=, $5, $9 +; NO-SIMD128-NEXT: i32.const $push17=, -1 +; NO-SIMD128-NEXT: i32.xor $push13=, $1, $pop17 +; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $pop13 +; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $9 +; NO-SIMD128-NEXT: i32.store 0($0), $pop16 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: bitselect_xor_reversed_v4i32: @@ -11349,25 +9631,23 @@ define <4 x i32> @bitselect_xor_reversed_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x ; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $pop3, $9 ; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop4 ; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $6, $10 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop21 +; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop19 ; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $pop5 ; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $10 ; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop8 ; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $7, $11 -; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop20 +; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop18 ; 
NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $pop9 ; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $pop11, $11 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17 ; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $8, $12 -; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop19 +; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop17 ; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $pop13 ; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $12 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop18), $pop16 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop16 ; NO-SIMD128-FAST-NEXT: return %xor1 = xor <4 x i32> %v1, %v2 %notc = xor <4 x i32> %c, @@ -11394,24 +9674,22 @@ define <4 x i32> @extmul_low_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) { ; NO-SIMD128-LABEL: extmul_low_s_v4i32: ; NO-SIMD128: .functype extmul_low_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.extend16_s $push1=, $3 -; NO-SIMD128-NEXT: i32.extend16_s $push0=, $11 +; NO-SIMD128-NEXT: i32.extend16_s $push1=, $4 +; NO-SIMD128-NEXT: i32.extend16_s $push0=, $12 ; NO-SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0 -; NO-SIMD128-NEXT: i32.store 8($0), $pop2 -; NO-SIMD128-NEXT: i32.extend16_s $push4=, $2 -; NO-SIMD128-NEXT: i32.extend16_s $push3=, $10 +; NO-SIMD128-NEXT: i32.store 12($0), $pop2 +; NO-SIMD128-NEXT: i32.extend16_s $push4=, $3 +; NO-SIMD128-NEXT: i32.extend16_s $push3=, $11 ; NO-SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3 -; NO-SIMD128-NEXT: i32.store 4($0), $pop5 -; NO-SIMD128-NEXT: i32.extend16_s $push7=, $1 -; NO-SIMD128-NEXT: i32.extend16_s $push6=, $9 +; NO-SIMD128-NEXT: i32.store 8($0), $pop5 +; NO-SIMD128-NEXT: i32.extend16_s $push7=, $2 +; NO-SIMD128-NEXT: i32.extend16_s $push6=, $10 ; NO-SIMD128-NEXT: i32.mul $push8=, $pop7, $pop6 
-; NO-SIMD128-NEXT: i32.store 0($0), $pop8 -; NO-SIMD128-NEXT: i32.const $push12=, 12 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.extend16_s $push10=, $4 -; NO-SIMD128-NEXT: i32.extend16_s $push9=, $12 +; NO-SIMD128-NEXT: i32.store 4($0), $pop8 +; NO-SIMD128-NEXT: i32.extend16_s $push10=, $1 +; NO-SIMD128-NEXT: i32.extend16_s $push9=, $9 ; NO-SIMD128-NEXT: i32.mul $push11=, $pop10, $pop9 -; NO-SIMD128-NEXT: i32.store 0($pop13), $pop11 +; NO-SIMD128-NEXT: i32.store 0($0), $pop11 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: extmul_low_s_v4i32: @@ -11429,12 +9707,10 @@ define <4 x i32> @extmul_low_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) { ; NO-SIMD128-FAST-NEXT: i32.extend16_s $push6=, $11 ; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $pop7, $pop6 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push12=, $4 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $12 -; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $pop12, $pop11 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop10), $pop13 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push10=, $4 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push9=, $12 +; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop11 ; NO-SIMD128-FAST-NEXT: return %low1 = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> @@ -11464,24 +9740,22 @@ define <4 x i32> @extmul_high_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) { ; NO-SIMD128-LABEL: extmul_high_s_v4i32: ; NO-SIMD128: .functype extmul_high_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.extend16_s $push1=, $7 -; NO-SIMD128-NEXT: i32.extend16_s $push0=, $15 +; NO-SIMD128-NEXT: i32.extend16_s $push1=, $8 +; NO-SIMD128-NEXT: i32.extend16_s $push0=, $16 ; NO-SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0 -; 
NO-SIMD128-NEXT: i32.store 8($0), $pop2 -; NO-SIMD128-NEXT: i32.extend16_s $push4=, $6 -; NO-SIMD128-NEXT: i32.extend16_s $push3=, $14 +; NO-SIMD128-NEXT: i32.store 12($0), $pop2 +; NO-SIMD128-NEXT: i32.extend16_s $push4=, $7 +; NO-SIMD128-NEXT: i32.extend16_s $push3=, $15 ; NO-SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3 -; NO-SIMD128-NEXT: i32.store 4($0), $pop5 -; NO-SIMD128-NEXT: i32.extend16_s $push7=, $5 -; NO-SIMD128-NEXT: i32.extend16_s $push6=, $13 +; NO-SIMD128-NEXT: i32.store 8($0), $pop5 +; NO-SIMD128-NEXT: i32.extend16_s $push7=, $6 +; NO-SIMD128-NEXT: i32.extend16_s $push6=, $14 ; NO-SIMD128-NEXT: i32.mul $push8=, $pop7, $pop6 -; NO-SIMD128-NEXT: i32.store 0($0), $pop8 -; NO-SIMD128-NEXT: i32.const $push12=, 12 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.extend16_s $push10=, $8 -; NO-SIMD128-NEXT: i32.extend16_s $push9=, $16 +; NO-SIMD128-NEXT: i32.store 4($0), $pop8 +; NO-SIMD128-NEXT: i32.extend16_s $push10=, $5 +; NO-SIMD128-NEXT: i32.extend16_s $push9=, $13 ; NO-SIMD128-NEXT: i32.mul $push11=, $pop10, $pop9 -; NO-SIMD128-NEXT: i32.store 0($pop13), $pop11 +; NO-SIMD128-NEXT: i32.store 0($0), $pop11 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: extmul_high_s_v4i32: @@ -11499,12 +9773,10 @@ define <4 x i32> @extmul_high_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) { ; NO-SIMD128-FAST-NEXT: i32.extend16_s $push6=, $15 ; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $pop7, $pop6 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push12=, $8 -; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $16 -; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $pop12, $pop11 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop10), $pop13 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push10=, $8 +; NO-SIMD128-FAST-NEXT: i32.extend16_s $push9=, $16 +; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9 +; NO-SIMD128-FAST-NEXT: i32.store 
12($0), $pop11 ; NO-SIMD128-FAST-NEXT: return %high1 = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> @@ -11535,31 +9807,29 @@ define <4 x i32> @extmul_low_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) { ; NO-SIMD128: .functype extmul_low_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 65535 -; NO-SIMD128-NEXT: i32.and $push2=, $3, $pop0 -; NO-SIMD128-NEXT: i32.const $push21=, 65535 -; NO-SIMD128-NEXT: i32.and $push1=, $11, $pop21 -; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1 -; NO-SIMD128-NEXT: i32.store 8($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push20=, 65535 -; NO-SIMD128-NEXT: i32.and $push5=, $2, $pop20 +; NO-SIMD128-NEXT: i32.and $push2=, $4, $pop0 ; NO-SIMD128-NEXT: i32.const $push19=, 65535 -; NO-SIMD128-NEXT: i32.and $push4=, $10, $pop19 -; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4 -; NO-SIMD128-NEXT: i32.store 4($0), $pop6 +; NO-SIMD128-NEXT: i32.and $push1=, $12, $pop19 +; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1 +; NO-SIMD128-NEXT: i32.store 12($0), $pop3 ; NO-SIMD128-NEXT: i32.const $push18=, 65535 -; NO-SIMD128-NEXT: i32.and $push8=, $1, $pop18 +; NO-SIMD128-NEXT: i32.and $push5=, $3, $pop18 ; NO-SIMD128-NEXT: i32.const $push17=, 65535 -; NO-SIMD128-NEXT: i32.and $push7=, $9, $pop17 -; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7 -; NO-SIMD128-NEXT: i32.store 0($0), $pop9 -; NO-SIMD128-NEXT: i32.const $push13=, 12 -; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13 +; NO-SIMD128-NEXT: i32.and $push4=, $11, $pop17 +; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4 +; NO-SIMD128-NEXT: i32.store 8($0), $pop6 ; NO-SIMD128-NEXT: i32.const $push16=, 65535 -; NO-SIMD128-NEXT: i32.and $push11=, $4, $pop16 +; NO-SIMD128-NEXT: i32.and $push8=, $2, $pop16 ; NO-SIMD128-NEXT: i32.const $push15=, 65535 -; NO-SIMD128-NEXT: i32.and $push10=, $12, $pop15 +; NO-SIMD128-NEXT: i32.and $push7=, $10, $pop15 +; NO-SIMD128-NEXT: i32.mul 
$push9=, $pop8, $pop7 +; NO-SIMD128-NEXT: i32.store 4($0), $pop9 +; NO-SIMD128-NEXT: i32.const $push14=, 65535 +; NO-SIMD128-NEXT: i32.and $push11=, $1, $pop14 +; NO-SIMD128-NEXT: i32.const $push13=, 65535 +; NO-SIMD128-NEXT: i32.and $push10=, $9, $pop13 ; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10 -; NO-SIMD128-NEXT: i32.store 0($pop14), $pop12 +; NO-SIMD128-NEXT: i32.store 0($0), $pop12 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: extmul_low_u_v4i32: @@ -11567,30 +9837,28 @@ define <4 x i32> @extmul_low_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) { ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535 ; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop21 +; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop19 ; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1 ; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push20=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop19 -; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4 -; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop6 ; NO-SIMD128-FAST-NEXT: i32.const $push18=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop18 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop18 ; NO-SIMD128-FAST-NEXT: i32.const $push17=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop17 -; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7 -; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 +; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop17 +; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4 +; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop6 ; NO-SIMD128-FAST-NEXT: i32.const $push16=, 65535 -; 
NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop16 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop16 ; NO-SIMD128-FAST-NEXT: i32.const $push15=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop15 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop15 +; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7 +; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop9 +; NO-SIMD128-FAST-NEXT: i32.const $push14=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop14 +; NO-SIMD128-FAST-NEXT: i32.const $push13=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop13 ; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop14), $pop12 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop12 ; NO-SIMD128-FAST-NEXT: return %low1 = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> @@ -11621,31 +9889,29 @@ define <4 x i32> @extmul_high_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) { ; NO-SIMD128: .functype extmul_high_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: i32.const $push0=, 65535 -; NO-SIMD128-NEXT: i32.and $push2=, $7, $pop0 -; NO-SIMD128-NEXT: i32.const $push21=, 65535 -; NO-SIMD128-NEXT: i32.and $push1=, $15, $pop21 -; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1 -; NO-SIMD128-NEXT: i32.store 8($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push20=, 65535 -; NO-SIMD128-NEXT: i32.and $push5=, $6, $pop20 +; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0 ; NO-SIMD128-NEXT: i32.const $push19=, 65535 -; NO-SIMD128-NEXT: i32.and $push4=, $14, $pop19 -; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4 -; NO-SIMD128-NEXT: i32.store 4($0), $pop6 +; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop19 +; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1 +; NO-SIMD128-NEXT: i32.store 12($0), $pop3 ; NO-SIMD128-NEXT: i32.const $push18=, 65535 -; NO-SIMD128-NEXT: i32.and $push8=, $5, $pop18 +; NO-SIMD128-NEXT: i32.and $push5=, $7, $pop18 ; 
NO-SIMD128-NEXT: i32.const $push17=, 65535 -; NO-SIMD128-NEXT: i32.and $push7=, $13, $pop17 -; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7 -; NO-SIMD128-NEXT: i32.store 0($0), $pop9 -; NO-SIMD128-NEXT: i32.const $push13=, 12 -; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13 +; NO-SIMD128-NEXT: i32.and $push4=, $15, $pop17 +; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4 +; NO-SIMD128-NEXT: i32.store 8($0), $pop6 ; NO-SIMD128-NEXT: i32.const $push16=, 65535 -; NO-SIMD128-NEXT: i32.and $push11=, $8, $pop16 +; NO-SIMD128-NEXT: i32.and $push8=, $6, $pop16 ; NO-SIMD128-NEXT: i32.const $push15=, 65535 -; NO-SIMD128-NEXT: i32.and $push10=, $16, $pop15 +; NO-SIMD128-NEXT: i32.and $push7=, $14, $pop15 +; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7 +; NO-SIMD128-NEXT: i32.store 4($0), $pop9 +; NO-SIMD128-NEXT: i32.const $push14=, 65535 +; NO-SIMD128-NEXT: i32.and $push11=, $5, $pop14 +; NO-SIMD128-NEXT: i32.const $push13=, 65535 +; NO-SIMD128-NEXT: i32.and $push10=, $13, $pop13 ; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10 -; NO-SIMD128-NEXT: i32.store 0($pop14), $pop12 +; NO-SIMD128-NEXT: i32.store 0($0), $pop12 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: extmul_high_u_v4i32: @@ -11653,30 +9919,28 @@ define <4 x i32> @extmul_high_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) { ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535 ; NO-SIMD128-FAST-NEXT: i32.and $push2=, $5, $pop0 -; NO-SIMD128-FAST-NEXT: i32.const $push21=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push1=, $13, $pop21 +; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push1=, $13, $pop19 ; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1 ; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push20=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $6, $pop20 -; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push4=, $14, $pop19 -; NO-SIMD128-FAST-NEXT: i32.mul $push6=, 
$pop5, $pop4 -; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop6 ; NO-SIMD128-FAST-NEXT: i32.const $push18=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push8=, $7, $pop18 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $6, $pop18 ; NO-SIMD128-FAST-NEXT: i32.const $push17=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push7=, $15, $pop17 -; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7 -; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop9 -; NO-SIMD128-FAST-NEXT: i32.const $push13=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13 +; NO-SIMD128-FAST-NEXT: i32.and $push4=, $14, $pop17 +; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4 +; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop6 ; NO-SIMD128-FAST-NEXT: i32.const $push16=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push11=, $8, $pop16 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $7, $pop16 ; NO-SIMD128-FAST-NEXT: i32.const $push15=, 65535 -; NO-SIMD128-FAST-NEXT: i32.and $push10=, $16, $pop15 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $15, $pop15 +; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7 +; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop9 +; NO-SIMD128-FAST-NEXT: i32.const $push14=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push11=, $8, $pop14 +; NO-SIMD128-FAST-NEXT: i32.const $push13=, 65535 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $16, $pop13 ; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop14), $pop12 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop12 ; NO-SIMD128-FAST-NEXT: return %high1 = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> @@ -13061,16 +11325,14 @@ define <4 x float> @neg_v4f32(<4 x float> %x) { ; NO-SIMD128-LABEL: neg_v4f32: ; NO-SIMD128: .functype neg_v4f32 (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: f32.neg $push0=, $3 -; NO-SIMD128-NEXT: f32.store 8($0), $pop0 -; NO-SIMD128-NEXT: f32.neg $push1=, $2 -; NO-SIMD128-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-NEXT: f32.neg $push2=, $1 -; NO-SIMD128-NEXT: 
f32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push3=, 12 -; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-NEXT: f32.neg $push5=, $4 -; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-NEXT: f32.neg $push0=, $4 +; NO-SIMD128-NEXT: f32.store 12($0), $pop0 +; NO-SIMD128-NEXT: f32.neg $push1=, $3 +; NO-SIMD128-NEXT: f32.store 8($0), $pop1 +; NO-SIMD128-NEXT: f32.neg $push2=, $2 +; NO-SIMD128-NEXT: f32.store 4($0), $pop2 +; NO-SIMD128-NEXT: f32.neg $push3=, $1 +; NO-SIMD128-NEXT: f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: neg_v4f32: @@ -13082,10 +11344,8 @@ define <4 x float> @neg_v4f32(<4 x float> %x) { ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: f32.neg $push2=, $3 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: f32.neg $push5=, $4 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: f32.neg $push3=, $4 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = fsub nsz <4 x float> , %x ret <4 x float> %a @@ -13108,16 +11368,14 @@ define <4 x float> @abs_v4f32(<4 x float> %x) { ; NO-SIMD128-LABEL: abs_v4f32: ; NO-SIMD128: .functype abs_v4f32 (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: f32.abs $push0=, $3 -; NO-SIMD128-NEXT: f32.store 8($0), $pop0 -; NO-SIMD128-NEXT: f32.abs $push1=, $2 -; NO-SIMD128-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-NEXT: f32.abs $push2=, $1 -; NO-SIMD128-NEXT: f32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push3=, 12 -; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-NEXT: f32.abs $push5=, $4 -; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-NEXT: f32.abs $push0=, $4 +; NO-SIMD128-NEXT: f32.store 12($0), $pop0 +; NO-SIMD128-NEXT: f32.abs $push1=, $3 +; NO-SIMD128-NEXT: f32.store 8($0), $pop1 +; NO-SIMD128-NEXT: f32.abs 
$push2=, $2 +; NO-SIMD128-NEXT: f32.store 4($0), $pop2 +; NO-SIMD128-NEXT: f32.abs $push3=, $1 +; NO-SIMD128-NEXT: f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: abs_v4f32: @@ -13129,10 +11387,8 @@ define <4 x float> @abs_v4f32(<4 x float> %x) { ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: f32.abs $push2=, $3 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: f32.abs $push5=, $4 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: f32.abs $push3=, $4 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x) ret <4 x float> %a @@ -13157,54 +11413,50 @@ define <4 x float> @min_unordered_v4f32(<4 x float> %x) { ; NO-SIMD128: .functype min_unordered_v4f32 (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: f32.const $push0=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.const $push17=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.gt $push1=, $3, $pop17 -; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $3, $pop1 -; NO-SIMD128-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-NEXT: f32.const $push16=, 0x1.4p2 ; NO-SIMD128-NEXT: f32.const $push15=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.gt $push3=, $2, $pop15 -; NO-SIMD128-NEXT: f32.select $push4=, $pop16, $2, $pop3 -; NO-SIMD128-NEXT: f32.store 4($0), $pop4 +; NO-SIMD128-NEXT: f32.gt $push1=, $4, $pop15 +; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $4, $pop1 +; NO-SIMD128-NEXT: f32.store 12($0), $pop2 ; NO-SIMD128-NEXT: f32.const $push14=, 0x1.4p2 ; NO-SIMD128-NEXT: f32.const $push13=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.gt $push5=, $1, $pop13 -; NO-SIMD128-NEXT: f32.select $push6=, $pop14, $1, $pop5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop6 -; NO-SIMD128-NEXT: i32.const $push9=, 12 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 +; NO-SIMD128-NEXT: f32.gt $push3=, $3, 
$pop13 +; NO-SIMD128-NEXT: f32.select $push4=, $pop14, $3, $pop3 +; NO-SIMD128-NEXT: f32.store 8($0), $pop4 ; NO-SIMD128-NEXT: f32.const $push12=, 0x1.4p2 ; NO-SIMD128-NEXT: f32.const $push11=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.gt $push7=, $4, $pop11 -; NO-SIMD128-NEXT: f32.select $push8=, $pop12, $4, $pop7 -; NO-SIMD128-NEXT: f32.store 0($pop10), $pop8 +; NO-SIMD128-NEXT: f32.gt $push5=, $2, $pop11 +; NO-SIMD128-NEXT: f32.select $push6=, $pop12, $2, $pop5 +; NO-SIMD128-NEXT: f32.store 4($0), $pop6 +; NO-SIMD128-NEXT: f32.const $push10=, 0x1.4p2 +; NO-SIMD128-NEXT: f32.const $push9=, 0x1.4p2 +; NO-SIMD128-NEXT: f32.gt $push7=, $1, $pop9 +; NO-SIMD128-NEXT: f32.select $push8=, $pop10, $1, $pop7 +; NO-SIMD128-NEXT: f32.store 0($0), $pop8 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: min_unordered_v4f32: ; NO-SIMD128-FAST: .functype min_unordered_v4f32 (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: f32.const $push0=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.const $push17=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.gt $push1=, $1, $pop17 +; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2 +; NO-SIMD128-FAST-NEXT: f32.gt $push1=, $1, $pop15 ; NO-SIMD128-FAST-NEXT: f32.select $push2=, $pop0, $1, $pop1 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop2 -; NO-SIMD128-FAST-NEXT: f32.const $push16=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.gt $push3=, $2, $pop15 -; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop16, $2, $pop3 -; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4 ; NO-SIMD128-FAST-NEXT: f32.const $push14=, 0x1.4p2 ; NO-SIMD128-FAST-NEXT: f32.const $push13=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.gt $push5=, $3, $pop13 -; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop14, $3, $pop5 -; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9 +; NO-SIMD128-FAST-NEXT: f32.gt $push3=, $2, $pop13 +; 
NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop14, $2, $pop3 +; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4 ; NO-SIMD128-FAST-NEXT: f32.const $push12=, 0x1.4p2 ; NO-SIMD128-FAST-NEXT: f32.const $push11=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.gt $push7=, $4, $pop11 -; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop12, $4, $pop7 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop10), $pop8 +; NO-SIMD128-FAST-NEXT: f32.gt $push5=, $3, $pop11 +; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop12, $3, $pop5 +; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6 +; NO-SIMD128-FAST-NEXT: f32.const $push10=, 0x1.4p2 +; NO-SIMD128-FAST-NEXT: f32.const $push9=, 0x1.4p2 +; NO-SIMD128-FAST-NEXT: f32.gt $push7=, $4, $pop9 +; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop10, $4, $pop7 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop8 ; NO-SIMD128-FAST-NEXT: return %cmps = fcmp ule <4 x float> %x, %a = select <4 x i1> %cmps, <4 x float> %x, @@ -13231,54 +11483,50 @@ define <4 x float> @max_unordered_v4f32(<4 x float> %x) { ; NO-SIMD128: .functype max_unordered_v4f32 (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: f32.const $push0=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.const $push17=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.lt $push1=, $3, $pop17 -; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $3, $pop1 -; NO-SIMD128-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-NEXT: f32.const $push16=, 0x1.4p2 ; NO-SIMD128-NEXT: f32.const $push15=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.lt $push3=, $2, $pop15 -; NO-SIMD128-NEXT: f32.select $push4=, $pop16, $2, $pop3 -; NO-SIMD128-NEXT: f32.store 4($0), $pop4 +; NO-SIMD128-NEXT: f32.lt $push1=, $4, $pop15 +; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $4, $pop1 +; NO-SIMD128-NEXT: f32.store 12($0), $pop2 ; NO-SIMD128-NEXT: f32.const $push14=, 0x1.4p2 ; NO-SIMD128-NEXT: f32.const $push13=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.lt $push5=, $1, $pop13 -; NO-SIMD128-NEXT: f32.select $push6=, $pop14, $1, $pop5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop6 -; NO-SIMD128-NEXT: 
i32.const $push9=, 12 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 +; NO-SIMD128-NEXT: f32.lt $push3=, $3, $pop13 +; NO-SIMD128-NEXT: f32.select $push4=, $pop14, $3, $pop3 +; NO-SIMD128-NEXT: f32.store 8($0), $pop4 ; NO-SIMD128-NEXT: f32.const $push12=, 0x1.4p2 ; NO-SIMD128-NEXT: f32.const $push11=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.lt $push7=, $4, $pop11 -; NO-SIMD128-NEXT: f32.select $push8=, $pop12, $4, $pop7 -; NO-SIMD128-NEXT: f32.store 0($pop10), $pop8 +; NO-SIMD128-NEXT: f32.lt $push5=, $2, $pop11 +; NO-SIMD128-NEXT: f32.select $push6=, $pop12, $2, $pop5 +; NO-SIMD128-NEXT: f32.store 4($0), $pop6 +; NO-SIMD128-NEXT: f32.const $push10=, 0x1.4p2 +; NO-SIMD128-NEXT: f32.const $push9=, 0x1.4p2 +; NO-SIMD128-NEXT: f32.lt $push7=, $1, $pop9 +; NO-SIMD128-NEXT: f32.select $push8=, $pop10, $1, $pop7 +; NO-SIMD128-NEXT: f32.store 0($0), $pop8 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: max_unordered_v4f32: ; NO-SIMD128-FAST: .functype max_unordered_v4f32 (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: f32.const $push0=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.const $push17=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.lt $push1=, $1, $pop17 +; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2 +; NO-SIMD128-FAST-NEXT: f32.lt $push1=, $1, $pop15 ; NO-SIMD128-FAST-NEXT: f32.select $push2=, $pop0, $1, $pop1 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop2 -; NO-SIMD128-FAST-NEXT: f32.const $push16=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.lt $push3=, $2, $pop15 -; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop16, $2, $pop3 -; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4 ; NO-SIMD128-FAST-NEXT: f32.const $push14=, 0x1.4p2 ; NO-SIMD128-FAST-NEXT: f32.const $push13=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.lt $push5=, $3, $pop13 -; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop14, $3, $pop5 -; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12 -; 
NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9 +; NO-SIMD128-FAST-NEXT: f32.lt $push3=, $2, $pop13 +; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop14, $2, $pop3 +; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4 ; NO-SIMD128-FAST-NEXT: f32.const $push12=, 0x1.4p2 ; NO-SIMD128-FAST-NEXT: f32.const $push11=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.lt $push7=, $4, $pop11 -; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop12, $4, $pop7 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop10), $pop8 +; NO-SIMD128-FAST-NEXT: f32.lt $push5=, $3, $pop11 +; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop12, $3, $pop5 +; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6 +; NO-SIMD128-FAST-NEXT: f32.const $push10=, 0x1.4p2 +; NO-SIMD128-FAST-NEXT: f32.const $push9=, 0x1.4p2 +; NO-SIMD128-FAST-NEXT: f32.lt $push7=, $4, $pop9 +; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop10, $4, $pop7 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop8 ; NO-SIMD128-FAST-NEXT: return %cmps = fcmp uge <4 x float> %x, %a = select <4 x i1> %cmps, <4 x float> %x, @@ -13305,54 +11553,50 @@ define <4 x float> @min_ordered_v4f32(<4 x float> %x) { ; NO-SIMD128: .functype min_ordered_v4f32 (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: f32.const $push0=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.const $push17=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.ge $push1=, $3, $pop17 -; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $3, $pop1 -; NO-SIMD128-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-NEXT: f32.const $push16=, 0x1.4p2 ; NO-SIMD128-NEXT: f32.const $push15=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.ge $push3=, $2, $pop15 -; NO-SIMD128-NEXT: f32.select $push4=, $pop16, $2, $pop3 -; NO-SIMD128-NEXT: f32.store 4($0), $pop4 +; NO-SIMD128-NEXT: f32.ge $push1=, $4, $pop15 +; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $4, $pop1 +; NO-SIMD128-NEXT: f32.store 12($0), $pop2 ; NO-SIMD128-NEXT: f32.const $push14=, 0x1.4p2 ; NO-SIMD128-NEXT: f32.const $push13=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.ge $push5=, $1, $pop13 -; NO-SIMD128-NEXT: 
f32.select $push6=, $pop14, $1, $pop5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop6 -; NO-SIMD128-NEXT: i32.const $push9=, 12 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 +; NO-SIMD128-NEXT: f32.ge $push3=, $3, $pop13 +; NO-SIMD128-NEXT: f32.select $push4=, $pop14, $3, $pop3 +; NO-SIMD128-NEXT: f32.store 8($0), $pop4 ; NO-SIMD128-NEXT: f32.const $push12=, 0x1.4p2 ; NO-SIMD128-NEXT: f32.const $push11=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.ge $push7=, $4, $pop11 -; NO-SIMD128-NEXT: f32.select $push8=, $pop12, $4, $pop7 -; NO-SIMD128-NEXT: f32.store 0($pop10), $pop8 +; NO-SIMD128-NEXT: f32.ge $push5=, $2, $pop11 +; NO-SIMD128-NEXT: f32.select $push6=, $pop12, $2, $pop5 +; NO-SIMD128-NEXT: f32.store 4($0), $pop6 +; NO-SIMD128-NEXT: f32.const $push10=, 0x1.4p2 +; NO-SIMD128-NEXT: f32.const $push9=, 0x1.4p2 +; NO-SIMD128-NEXT: f32.ge $push7=, $1, $pop9 +; NO-SIMD128-NEXT: f32.select $push8=, $pop10, $1, $pop7 +; NO-SIMD128-NEXT: f32.store 0($0), $pop8 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: min_ordered_v4f32: ; NO-SIMD128-FAST: .functype min_ordered_v4f32 (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: f32.const $push0=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.const $push17=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.ge $push1=, $1, $pop17 +; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2 +; NO-SIMD128-FAST-NEXT: f32.ge $push1=, $1, $pop15 ; NO-SIMD128-FAST-NEXT: f32.select $push2=, $pop0, $1, $pop1 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop2 -; NO-SIMD128-FAST-NEXT: f32.const $push16=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.ge $push3=, $2, $pop15 -; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop16, $2, $pop3 -; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4 ; NO-SIMD128-FAST-NEXT: f32.const $push14=, 0x1.4p2 ; NO-SIMD128-FAST-NEXT: f32.const $push13=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.ge $push5=, $3, $pop13 -; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop14, $3, $pop5 -; 
NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9 +; NO-SIMD128-FAST-NEXT: f32.ge $push3=, $2, $pop13 +; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop14, $2, $pop3 +; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4 ; NO-SIMD128-FAST-NEXT: f32.const $push12=, 0x1.4p2 ; NO-SIMD128-FAST-NEXT: f32.const $push11=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.ge $push7=, $4, $pop11 -; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop12, $4, $pop7 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop10), $pop8 +; NO-SIMD128-FAST-NEXT: f32.ge $push5=, $3, $pop11 +; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop12, $3, $pop5 +; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6 +; NO-SIMD128-FAST-NEXT: f32.const $push10=, 0x1.4p2 +; NO-SIMD128-FAST-NEXT: f32.const $push9=, 0x1.4p2 +; NO-SIMD128-FAST-NEXT: f32.ge $push7=, $4, $pop9 +; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop10, $4, $pop7 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop8 ; NO-SIMD128-FAST-NEXT: return %cmps = fcmp ole <4 x float> , %x %a = select <4 x i1> %cmps, @@ -13379,54 +11623,50 @@ define <4 x float> @max_ordered_v4f32(<4 x float> %x) { ; NO-SIMD128: .functype max_ordered_v4f32 (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: f32.const $push0=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.const $push17=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.le $push1=, $3, $pop17 -; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $3, $pop1 -; NO-SIMD128-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-NEXT: f32.const $push16=, 0x1.4p2 ; NO-SIMD128-NEXT: f32.const $push15=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.le $push3=, $2, $pop15 -; NO-SIMD128-NEXT: f32.select $push4=, $pop16, $2, $pop3 -; NO-SIMD128-NEXT: f32.store 4($0), $pop4 +; NO-SIMD128-NEXT: f32.le $push1=, $4, $pop15 +; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $4, $pop1 +; NO-SIMD128-NEXT: f32.store 12($0), $pop2 ; NO-SIMD128-NEXT: f32.const $push14=, 0x1.4p2 ; NO-SIMD128-NEXT: f32.const 
$push13=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.le $push5=, $1, $pop13 -; NO-SIMD128-NEXT: f32.select $push6=, $pop14, $1, $pop5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop6 -; NO-SIMD128-NEXT: i32.const $push9=, 12 -; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9 +; NO-SIMD128-NEXT: f32.le $push3=, $3, $pop13 +; NO-SIMD128-NEXT: f32.select $push4=, $pop14, $3, $pop3 +; NO-SIMD128-NEXT: f32.store 8($0), $pop4 ; NO-SIMD128-NEXT: f32.const $push12=, 0x1.4p2 ; NO-SIMD128-NEXT: f32.const $push11=, 0x1.4p2 -; NO-SIMD128-NEXT: f32.le $push7=, $4, $pop11 -; NO-SIMD128-NEXT: f32.select $push8=, $pop12, $4, $pop7 -; NO-SIMD128-NEXT: f32.store 0($pop10), $pop8 +; NO-SIMD128-NEXT: f32.le $push5=, $2, $pop11 +; NO-SIMD128-NEXT: f32.select $push6=, $pop12, $2, $pop5 +; NO-SIMD128-NEXT: f32.store 4($0), $pop6 +; NO-SIMD128-NEXT: f32.const $push10=, 0x1.4p2 +; NO-SIMD128-NEXT: f32.const $push9=, 0x1.4p2 +; NO-SIMD128-NEXT: f32.le $push7=, $1, $pop9 +; NO-SIMD128-NEXT: f32.select $push8=, $pop10, $1, $pop7 +; NO-SIMD128-NEXT: f32.store 0($0), $pop8 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: max_ordered_v4f32: ; NO-SIMD128-FAST: .functype max_ordered_v4f32 (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: f32.const $push0=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.const $push17=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.le $push1=, $1, $pop17 +; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2 +; NO-SIMD128-FAST-NEXT: f32.le $push1=, $1, $pop15 ; NO-SIMD128-FAST-NEXT: f32.select $push2=, $pop0, $1, $pop1 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop2 -; NO-SIMD128-FAST-NEXT: f32.const $push16=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.le $push3=, $2, $pop15 -; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop16, $2, $pop3 -; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4 ; NO-SIMD128-FAST-NEXT: f32.const $push14=, 0x1.4p2 ; NO-SIMD128-FAST-NEXT: f32.const $push13=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.le 
$push5=, $3, $pop13 -; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop14, $3, $pop5 -; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6 -; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9 +; NO-SIMD128-FAST-NEXT: f32.le $push3=, $2, $pop13 +; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop14, $2, $pop3 +; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4 ; NO-SIMD128-FAST-NEXT: f32.const $push12=, 0x1.4p2 ; NO-SIMD128-FAST-NEXT: f32.const $push11=, 0x1.4p2 -; NO-SIMD128-FAST-NEXT: f32.le $push7=, $4, $pop11 -; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop12, $4, $pop7 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop10), $pop8 +; NO-SIMD128-FAST-NEXT: f32.le $push5=, $3, $pop11 +; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop12, $3, $pop5 +; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6 +; NO-SIMD128-FAST-NEXT: f32.const $push10=, 0x1.4p2 +; NO-SIMD128-FAST-NEXT: f32.const $push9=, 0x1.4p2 +; NO-SIMD128-FAST-NEXT: f32.le $push7=, $4, $pop9 +; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop10, $4, $pop7 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop8 ; NO-SIMD128-FAST-NEXT: return %cmps = fcmp oge <4 x float> , %x %a = select <4 x i1> %cmps, @@ -13451,16 +11691,14 @@ define <4 x float> @min_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: min_intrinsic_v4f32: ; NO-SIMD128: .functype min_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: f32.min $push0=, $3, $7 -; NO-SIMD128-NEXT: f32.store 8($0), $pop0 -; NO-SIMD128-NEXT: f32.min $push1=, $2, $6 -; NO-SIMD128-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-NEXT: f32.min $push2=, $1, $5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: f32.min $push3=, $4, $8 -; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: f32.min $push0=, $4, $8 +; NO-SIMD128-NEXT: f32.store 12($0), $pop0 +; 
NO-SIMD128-NEXT: f32.min $push1=, $3, $7 +; NO-SIMD128-NEXT: f32.store 8($0), $pop1 +; NO-SIMD128-NEXT: f32.min $push2=, $2, $6 +; NO-SIMD128-NEXT: f32.store 4($0), $pop2 +; NO-SIMD128-NEXT: f32.min $push3=, $1, $5 +; NO-SIMD128-NEXT: f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: min_intrinsic_v4f32: @@ -13472,10 +11710,8 @@ define <4 x float> @min_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: f32.min $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: f32.min $push5=, $4, $8 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: f32.min $push3=, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> %y) ret <4 x float> %a @@ -13552,16 +11788,14 @@ define <4 x float> @minnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: minnum_intrinsic_v4f32: ; NO-SIMD128: .functype minnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: call $push0=, fminf, $3, $7 -; NO-SIMD128-NEXT: f32.store 8($0), $pop0 -; NO-SIMD128-NEXT: call $push1=, fminf, $2, $6 -; NO-SIMD128-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-NEXT: call $push2=, fminf, $1, $5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push3=, 12 -; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-NEXT: call $push5=, fminf, $4, $8 -; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-NEXT: call $push0=, fminf, $4, $8 +; NO-SIMD128-NEXT: f32.store 12($0), $pop0 +; NO-SIMD128-NEXT: call $push1=, fminf, $3, $7 +; NO-SIMD128-NEXT: f32.store 8($0), $pop1 +; NO-SIMD128-NEXT: call $push2=, fminf, $2, $6 +; NO-SIMD128-NEXT: f32.store 4($0), $pop2 
+; NO-SIMD128-NEXT: call $push3=, fminf, $1, $5 +; NO-SIMD128-NEXT: f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: minnum_intrinsic_v4f32: @@ -13573,10 +11807,8 @@ define <4 x float> @minnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $3, $7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: call $push5=, fminf, $4, $8 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) ret <4 x float> %a @@ -13598,16 +11830,14 @@ define <4 x float> @minnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: minnum_nsz_intrinsic_v4f32: ; NO-SIMD128: .functype minnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: call $push0=, fminf, $3, $7 -; NO-SIMD128-NEXT: f32.store 8($0), $pop0 -; NO-SIMD128-NEXT: call $push1=, fminf, $2, $6 -; NO-SIMD128-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-NEXT: call $push2=, fminf, $1, $5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push3=, 12 -; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-NEXT: call $push5=, fminf, $4, $8 -; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-NEXT: call $push0=, fminf, $4, $8 +; NO-SIMD128-NEXT: f32.store 12($0), $pop0 +; NO-SIMD128-NEXT: call $push1=, fminf, $3, $7 +; NO-SIMD128-NEXT: f32.store 8($0), $pop1 +; NO-SIMD128-NEXT: call $push2=, fminf, $2, $6 +; NO-SIMD128-NEXT: f32.store 4($0), $pop2 +; NO-SIMD128-NEXT: call $push3=, fminf, $1, $5 +; NO-SIMD128-NEXT: f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; 
NO-SIMD128-FAST-LABEL: minnum_nsz_intrinsic_v4f32: @@ -13619,10 +11849,8 @@ define <4 x float> @minnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $3, $7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: call $push5=, fminf, $4, $8 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = call nnan nsz <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) ret <4 x float> %a @@ -13647,19 +11875,17 @@ define <4 x float> @fminnumv432_non_zero_intrinsic(<4 x float> %x) { ; NO-SIMD128: .functype fminnumv432_non_zero_intrinsic (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-NEXT: call $push1=, fminf, $3, $pop0 -; NO-SIMD128-NEXT: f32.store 8($0), $pop1 -; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0 -; NO-SIMD128-NEXT: call $push2=, fminf, $2, $pop9 -; NO-SIMD128-NEXT: f32.store 4($0), $pop2 -; NO-SIMD128-NEXT: f32.const $push8=, -0x1p0 -; NO-SIMD128-NEXT: call $push3=, fminf, $1, $pop8 -; NO-SIMD128-NEXT: f32.store 0($0), $pop3 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 +; NO-SIMD128-NEXT: call $push1=, fminf, $4, $pop0 +; NO-SIMD128-NEXT: f32.store 12($0), $pop1 ; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-NEXT: call $push6=, fminf, $4, $pop7 -; NO-SIMD128-NEXT: f32.store 0($pop5), $pop6 +; NO-SIMD128-NEXT: call $push2=, fminf, $3, $pop7 +; NO-SIMD128-NEXT: f32.store 8($0), $pop2 +; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0 +; NO-SIMD128-NEXT: call $push3=, fminf, $2, $pop6 +; NO-SIMD128-NEXT: f32.store 4($0), $pop3 +; NO-SIMD128-NEXT: f32.const $push5=, -0x1p0 +; NO-SIMD128-NEXT: call 
$push4=, fminf, $1, $pop5 +; NO-SIMD128-NEXT: f32.store 0($0), $pop4 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: fminnumv432_non_zero_intrinsic: @@ -13668,17 +11894,15 @@ define <4 x float> @fminnumv432_non_zero_intrinsic(<4 x float> %x) { ; NO-SIMD128-FAST-NEXT: f32.const $push0=, -0x1p0 ; NO-SIMD128-FAST-NEXT: call $push1=, fminf, $1, $pop0 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1 -; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $2, $pop9 +; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0 +; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $2, $pop7 ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop2 -; NO-SIMD128-FAST-NEXT: f32.const $push8=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $3, $pop8 +; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0 +; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $3, $pop6 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop3 -; NO-SIMD128-FAST-NEXT: i32.const $push4=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push6=, fminf, $4, $pop7 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop5), $pop6 +; NO-SIMD128-FAST-NEXT: f32.const $push5=, -0x1p0 +; NO-SIMD128-FAST-NEXT: call $push4=, fminf, $4, $pop5 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop4 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float>) ret <4 x float> %a @@ -13755,19 +11979,17 @@ define <4 x float> @fminnumv432_one_zero_intrinsic(<4 x float> %x) { ; NO-SIMD128: .functype fminnumv432_one_zero_intrinsic (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-NEXT: call $push1=, fminf, $3, $pop0 -; NO-SIMD128-NEXT: f32.store 8($0), $pop1 -; NO-SIMD128-NEXT: f32.const $push2=, 0x0p0 -; NO-SIMD128-NEXT: call $push3=, fminf, $2, $pop2 -; NO-SIMD128-NEXT: f32.store 4($0), $pop3 -; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0 -; 
NO-SIMD128-NEXT: call $push4=, fminf, $1, $pop9 -; NO-SIMD128-NEXT: f32.store 0($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push5=, 12 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 -; NO-SIMD128-NEXT: f32.const $push8=, -0x1p0 -; NO-SIMD128-NEXT: call $push7=, fminf, $4, $pop8 -; NO-SIMD128-NEXT: f32.store 0($pop6), $pop7 +; NO-SIMD128-NEXT: call $push1=, fminf, $4, $pop0 +; NO-SIMD128-NEXT: f32.store 12($0), $pop1 +; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0 +; NO-SIMD128-NEXT: call $push2=, fminf, $3, $pop7 +; NO-SIMD128-NEXT: f32.store 8($0), $pop2 +; NO-SIMD128-NEXT: f32.const $push3=, 0x0p0 +; NO-SIMD128-NEXT: call $push4=, fminf, $2, $pop3 +; NO-SIMD128-NEXT: f32.store 4($0), $pop4 +; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0 +; NO-SIMD128-NEXT: call $push5=, fminf, $1, $pop6 +; NO-SIMD128-NEXT: f32.store 0($0), $pop5 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: fminnumv432_one_zero_intrinsic: @@ -13779,14 +12001,12 @@ define <4 x float> @fminnumv432_one_zero_intrinsic(<4 x float> %x) { ; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x0p0 ; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $2, $pop2 ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3 -; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push4=, fminf, $3, $pop9 +; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0 +; NO-SIMD128-FAST-NEXT: call $push4=, fminf, $3, $pop7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push5=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push6=, $0, $pop5 -; NO-SIMD128-FAST-NEXT: f32.const $push8=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push7=, fminf, $4, $pop8 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop6), $pop7 +; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0 +; NO-SIMD128-FAST-NEXT: call $push5=, fminf, $4, $pop6 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float>) ret <4 x float> %a @@ -13809,16 +12029,14 @@ define 
<4 x float> @max_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: max_intrinsic_v4f32: ; NO-SIMD128: .functype max_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: f32.max $push0=, $3, $7 -; NO-SIMD128-NEXT: f32.store 8($0), $pop0 -; NO-SIMD128-NEXT: f32.max $push1=, $2, $6 -; NO-SIMD128-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-NEXT: f32.max $push2=, $1, $5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: f32.max $push3=, $4, $8 -; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: f32.max $push0=, $4, $8 +; NO-SIMD128-NEXT: f32.store 12($0), $pop0 +; NO-SIMD128-NEXT: f32.max $push1=, $3, $7 +; NO-SIMD128-NEXT: f32.store 8($0), $pop1 +; NO-SIMD128-NEXT: f32.max $push2=, $2, $6 +; NO-SIMD128-NEXT: f32.store 4($0), $pop2 +; NO-SIMD128-NEXT: f32.max $push3=, $1, $5 +; NO-SIMD128-NEXT: f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: max_intrinsic_v4f32: @@ -13830,10 +12048,8 @@ define <4 x float> @max_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: f32.max $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: f32.max $push5=, $4, $8 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: f32.max $push3=, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y) ret <4 x float> %a @@ -13910,16 +12126,14 @@ define <4 x float> @maxnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: maxnum_intrinsic_v4f32: ; NO-SIMD128: .functype maxnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> 
() ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: call $push0=, fmaxf, $3, $7 -; NO-SIMD128-NEXT: f32.store 8($0), $pop0 -; NO-SIMD128-NEXT: call $push1=, fmaxf, $2, $6 -; NO-SIMD128-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-NEXT: call $push2=, fmaxf, $1, $5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push3=, 12 -; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-NEXT: call $push5=, fmaxf, $4, $8 -; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-NEXT: call $push0=, fmaxf, $4, $8 +; NO-SIMD128-NEXT: f32.store 12($0), $pop0 +; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $7 +; NO-SIMD128-NEXT: f32.store 8($0), $pop1 +; NO-SIMD128-NEXT: call $push2=, fmaxf, $2, $6 +; NO-SIMD128-NEXT: f32.store 4($0), $pop2 +; NO-SIMD128-NEXT: call $push3=, fmaxf, $1, $5 +; NO-SIMD128-NEXT: f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: maxnum_intrinsic_v4f32: @@ -13931,10 +12145,8 @@ define <4 x float> @maxnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: call $push2=, fmaxf, $3, $7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $8 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y) ret <4 x float> %a @@ -13956,16 +12168,14 @@ define <4 x float> @maxnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: maxnum_nsz_intrinsic_v4f32: ; NO-SIMD128: .functype maxnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: call $push0=, fmaxf, $3, $7 -; NO-SIMD128-NEXT: f32.store 8($0), $pop0 -; 
NO-SIMD128-NEXT: call $push1=, fmaxf, $2, $6 -; NO-SIMD128-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-NEXT: call $push2=, fmaxf, $1, $5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push3=, 12 -; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-NEXT: call $push5=, fmaxf, $4, $8 -; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-NEXT: call $push0=, fmaxf, $4, $8 +; NO-SIMD128-NEXT: f32.store 12($0), $pop0 +; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $7 +; NO-SIMD128-NEXT: f32.store 8($0), $pop1 +; NO-SIMD128-NEXT: call $push2=, fmaxf, $2, $6 +; NO-SIMD128-NEXT: f32.store 4($0), $pop2 +; NO-SIMD128-NEXT: call $push3=, fmaxf, $1, $5 +; NO-SIMD128-NEXT: f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: maxnum_nsz_intrinsic_v4f32: @@ -13977,10 +12187,8 @@ define <4 x float> @maxnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: call $push2=, fmaxf, $3, $7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $8 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = call nnan nsz <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y) ret <4 x float> %a @@ -14057,19 +12265,17 @@ define <4 x float> @maxnum_one_zero_intrinsic_v4f32(<4 x float> %x, <4 x float> ; NO-SIMD128: .functype maxnum_one_zero_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $pop0 -; NO-SIMD128-NEXT: f32.store 8($0), $pop1 -; NO-SIMD128-NEXT: f32.const $push2=, 0x0p0 -; NO-SIMD128-NEXT: call $push3=, fmaxf, $2, $pop2 -; NO-SIMD128-NEXT: 
f32.store 4($0), $pop3 -; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0 -; NO-SIMD128-NEXT: call $push4=, fmaxf, $1, $pop9 -; NO-SIMD128-NEXT: f32.store 0($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push5=, 12 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 -; NO-SIMD128-NEXT: f32.const $push8=, -0x1p0 -; NO-SIMD128-NEXT: call $push7=, fmaxf, $4, $pop8 -; NO-SIMD128-NEXT: f32.store 0($pop6), $pop7 +; NO-SIMD128-NEXT: call $push1=, fmaxf, $4, $pop0 +; NO-SIMD128-NEXT: f32.store 12($0), $pop1 +; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0 +; NO-SIMD128-NEXT: call $push2=, fmaxf, $3, $pop7 +; NO-SIMD128-NEXT: f32.store 8($0), $pop2 +; NO-SIMD128-NEXT: f32.const $push3=, 0x0p0 +; NO-SIMD128-NEXT: call $push4=, fmaxf, $2, $pop3 +; NO-SIMD128-NEXT: f32.store 4($0), $pop4 +; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0 +; NO-SIMD128-NEXT: call $push5=, fmaxf, $1, $pop6 +; NO-SIMD128-NEXT: f32.store 0($0), $pop5 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: maxnum_one_zero_intrinsic_v4f32: @@ -14081,14 +12287,12 @@ define <4 x float> @maxnum_one_zero_intrinsic_v4f32(<4 x float> %x, <4 x float> ; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x0p0 ; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $2, $pop2 ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3 -; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop9 +; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0 +; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push5=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push6=, $0, $pop5 -; NO-SIMD128-FAST-NEXT: f32.const $push8=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push7=, fmaxf, $4, $pop8 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop6), $pop7 +; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0 +; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $pop6 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> 
@llvm.maxnum.v4f32(<4 x float> %x, <4 x float>) ret <4 x float> %a @@ -14113,19 +12317,17 @@ define <4 x float> @maxnum_non_zero_intrinsic_v4f32(<4 x float> %x, <4 x float> ; NO-SIMD128: .functype maxnum_non_zero_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $pop0 -; NO-SIMD128-NEXT: f32.store 8($0), $pop1 -; NO-SIMD128-NEXT: f32.const $push2=, 0x1p0 -; NO-SIMD128-NEXT: call $push3=, fmaxf, $2, $pop2 -; NO-SIMD128-NEXT: f32.store 4($0), $pop3 -; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0 -; NO-SIMD128-NEXT: call $push4=, fmaxf, $1, $pop9 -; NO-SIMD128-NEXT: f32.store 0($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push5=, 12 -; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5 -; NO-SIMD128-NEXT: f32.const $push8=, -0x1p0 -; NO-SIMD128-NEXT: call $push7=, fmaxf, $4, $pop8 -; NO-SIMD128-NEXT: f32.store 0($pop6), $pop7 +; NO-SIMD128-NEXT: call $push1=, fmaxf, $4, $pop0 +; NO-SIMD128-NEXT: f32.store 12($0), $pop1 +; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0 +; NO-SIMD128-NEXT: call $push2=, fmaxf, $3, $pop7 +; NO-SIMD128-NEXT: f32.store 8($0), $pop2 +; NO-SIMD128-NEXT: f32.const $push3=, 0x1p0 +; NO-SIMD128-NEXT: call $push4=, fmaxf, $2, $pop3 +; NO-SIMD128-NEXT: f32.store 4($0), $pop4 +; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0 +; NO-SIMD128-NEXT: call $push5=, fmaxf, $1, $pop6 +; NO-SIMD128-NEXT: f32.store 0($0), $pop5 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: maxnum_non_zero_intrinsic_v4f32: @@ -14137,14 +12339,12 @@ define <4 x float> @maxnum_non_zero_intrinsic_v4f32(<4 x float> %x, <4 x float> ; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x1p0 ; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $2, $pop2 ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3 -; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop9 +; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0 +; NO-SIMD128-FAST-NEXT: 
call $push4=, fmaxf, $3, $pop7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push5=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push6=, $0, $pop5 -; NO-SIMD128-FAST-NEXT: f32.const $push8=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push7=, fmaxf, $4, $pop8 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop6), $pop7 +; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0 +; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $pop6 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float>) ret <4 x float> %a @@ -14240,20 +12440,18 @@ define <4 x float> @pmin_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: pmin_v4f32: ; NO-SIMD128: .functype pmin_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: f32.lt $push0=, $7, $3 -; NO-SIMD128-NEXT: f32.select $push1=, $7, $3, $pop0 -; NO-SIMD128-NEXT: f32.store 8($0), $pop1 -; NO-SIMD128-NEXT: f32.lt $push2=, $6, $2 -; NO-SIMD128-NEXT: f32.select $push3=, $6, $2, $pop2 -; NO-SIMD128-NEXT: f32.store 4($0), $pop3 -; NO-SIMD128-NEXT: f32.lt $push4=, $5, $1 -; NO-SIMD128-NEXT: f32.select $push5=, $5, $1, $pop4 -; NO-SIMD128-NEXT: f32.store 0($0), $pop5 -; NO-SIMD128-NEXT: i32.const $push8=, 12 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: f32.lt $push6=, $8, $4 -; NO-SIMD128-NEXT: f32.select $push7=, $8, $4, $pop6 -; NO-SIMD128-NEXT: f32.store 0($pop9), $pop7 +; NO-SIMD128-NEXT: f32.lt $push0=, $8, $4 +; NO-SIMD128-NEXT: f32.select $push1=, $8, $4, $pop0 +; NO-SIMD128-NEXT: f32.store 12($0), $pop1 +; NO-SIMD128-NEXT: f32.lt $push2=, $7, $3 +; NO-SIMD128-NEXT: f32.select $push3=, $7, $3, $pop2 +; NO-SIMD128-NEXT: f32.store 8($0), $pop3 +; NO-SIMD128-NEXT: f32.lt $push4=, $6, $2 +; NO-SIMD128-NEXT: f32.select $push5=, $6, $2, $pop4 +; NO-SIMD128-NEXT: f32.store 4($0), $pop5 +; NO-SIMD128-NEXT: f32.lt $push6=, $5, $1 +; NO-SIMD128-NEXT: 
f32.select $push7=, $5, $1, $pop6 +; NO-SIMD128-NEXT: f32.store 0($0), $pop7 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: pmin_v4f32: @@ -14268,11 +12466,9 @@ define <4 x float> @pmin_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-FAST-NEXT: f32.lt $push4=, $7, $3 ; NO-SIMD128-FAST-NEXT: f32.select $push5=, $7, $3, $pop4 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop5 -; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8 ; NO-SIMD128-FAST-NEXT: f32.lt $push6=, $8, $4 ; NO-SIMD128-FAST-NEXT: f32.select $push7=, $8, $4, $pop6 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop9), $pop7 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop7 ; NO-SIMD128-FAST-NEXT: return %c = fcmp olt <4 x float> %y, %x %a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x @@ -14295,28 +12491,26 @@ define <4 x i32> @pmin_int_v4f32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-LABEL: pmin_int_v4f32: ; NO-SIMD128: .functype pmin_int_v4f32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 ; NO-SIMD128-NEXT: f32.reinterpret_i32 $push1=, $8 ; NO-SIMD128-NEXT: f32.reinterpret_i32 $push0=, $4 ; NO-SIMD128-NEXT: f32.lt $push2=, $pop1, $pop0 ; NO-SIMD128-NEXT: i32.select $push3=, $8, $4, $pop2 -; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3 -; NO-SIMD128-NEXT: f32.reinterpret_i32 $push7=, $7 -; NO-SIMD128-NEXT: f32.reinterpret_i32 $push6=, $3 -; NO-SIMD128-NEXT: f32.lt $push8=, $pop7, $pop6 -; NO-SIMD128-NEXT: i32.select $push9=, $7, $3, $pop8 -; NO-SIMD128-NEXT: i32.store 8($0), $pop9 -; NO-SIMD128-NEXT: f32.reinterpret_i32 $push11=, $6 -; NO-SIMD128-NEXT: f32.reinterpret_i32 $push10=, $2 -; NO-SIMD128-NEXT: f32.lt $push12=, $pop11, $pop10 -; NO-SIMD128-NEXT: i32.select $push13=, $6, $2, $pop12 -; NO-SIMD128-NEXT: i32.store 4($0), $pop13 -; NO-SIMD128-NEXT: f32.reinterpret_i32 $push15=, $5 -; NO-SIMD128-NEXT: f32.reinterpret_i32 
$push14=, $1 -; NO-SIMD128-NEXT: f32.lt $push16=, $pop15, $pop14 -; NO-SIMD128-NEXT: i32.select $push17=, $5, $1, $pop16 -; NO-SIMD128-NEXT: i32.store 0($0), $pop17 +; NO-SIMD128-NEXT: i32.store 12($0), $pop3 +; NO-SIMD128-NEXT: f32.reinterpret_i32 $push5=, $7 +; NO-SIMD128-NEXT: f32.reinterpret_i32 $push4=, $3 +; NO-SIMD128-NEXT: f32.lt $push6=, $pop5, $pop4 +; NO-SIMD128-NEXT: i32.select $push7=, $7, $3, $pop6 +; NO-SIMD128-NEXT: i32.store 8($0), $pop7 +; NO-SIMD128-NEXT: f32.reinterpret_i32 $push9=, $6 +; NO-SIMD128-NEXT: f32.reinterpret_i32 $push8=, $2 +; NO-SIMD128-NEXT: f32.lt $push10=, $pop9, $pop8 +; NO-SIMD128-NEXT: i32.select $push11=, $6, $2, $pop10 +; NO-SIMD128-NEXT: i32.store 4($0), $pop11 +; NO-SIMD128-NEXT: f32.reinterpret_i32 $push13=, $5 +; NO-SIMD128-NEXT: f32.reinterpret_i32 $push12=, $1 +; NO-SIMD128-NEXT: f32.lt $push14=, $pop13, $pop12 +; NO-SIMD128-NEXT: i32.select $push15=, $5, $1, $pop14 +; NO-SIMD128-NEXT: i32.store 0($0), $pop15 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: pmin_int_v4f32: @@ -14337,13 +12531,11 @@ define <4 x i32> @pmin_int_v4f32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-FAST-NEXT: f32.lt $push10=, $pop9, $pop8 ; NO-SIMD128-FAST-NEXT: i32.select $push11=, $7, $3, $pop10 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop11 -; NO-SIMD128-FAST-NEXT: i32.const $push16=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16 ; NO-SIMD128-FAST-NEXT: f32.reinterpret_i32 $push13=, $8 ; NO-SIMD128-FAST-NEXT: f32.reinterpret_i32 $push12=, $4 ; NO-SIMD128-FAST-NEXT: f32.lt $push14=, $pop13, $pop12 ; NO-SIMD128-FAST-NEXT: i32.select $push15=, $8, $4, $pop14 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop17), $pop15 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop15 ; NO-SIMD128-FAST-NEXT: return %fx = bitcast <4 x i32> %x to <4 x float> %fy = bitcast <4 x i32> %y to <4 x float> @@ -14368,20 +12560,18 @@ define <4 x float> @pmax_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: pmax_v4f32: ; NO-SIMD128: .functype pmax_v4f32 
(i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: f32.lt $push0=, $3, $7 -; NO-SIMD128-NEXT: f32.select $push1=, $7, $3, $pop0 -; NO-SIMD128-NEXT: f32.store 8($0), $pop1 -; NO-SIMD128-NEXT: f32.lt $push2=, $2, $6 -; NO-SIMD128-NEXT: f32.select $push3=, $6, $2, $pop2 -; NO-SIMD128-NEXT: f32.store 4($0), $pop3 -; NO-SIMD128-NEXT: f32.lt $push4=, $1, $5 -; NO-SIMD128-NEXT: f32.select $push5=, $5, $1, $pop4 -; NO-SIMD128-NEXT: f32.store 0($0), $pop5 -; NO-SIMD128-NEXT: i32.const $push8=, 12 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: f32.lt $push6=, $4, $8 -; NO-SIMD128-NEXT: f32.select $push7=, $8, $4, $pop6 -; NO-SIMD128-NEXT: f32.store 0($pop9), $pop7 +; NO-SIMD128-NEXT: f32.lt $push0=, $4, $8 +; NO-SIMD128-NEXT: f32.select $push1=, $8, $4, $pop0 +; NO-SIMD128-NEXT: f32.store 12($0), $pop1 +; NO-SIMD128-NEXT: f32.lt $push2=, $3, $7 +; NO-SIMD128-NEXT: f32.select $push3=, $7, $3, $pop2 +; NO-SIMD128-NEXT: f32.store 8($0), $pop3 +; NO-SIMD128-NEXT: f32.lt $push4=, $2, $6 +; NO-SIMD128-NEXT: f32.select $push5=, $6, $2, $pop4 +; NO-SIMD128-NEXT: f32.store 4($0), $pop5 +; NO-SIMD128-NEXT: f32.lt $push6=, $1, $5 +; NO-SIMD128-NEXT: f32.select $push7=, $5, $1, $pop6 +; NO-SIMD128-NEXT: f32.store 0($0), $pop7 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: pmax_v4f32: @@ -14396,11 +12586,9 @@ define <4 x float> @pmax_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-FAST-NEXT: f32.lt $push4=, $3, $7 ; NO-SIMD128-FAST-NEXT: f32.select $push5=, $7, $3, $pop4 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop5 -; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8 ; NO-SIMD128-FAST-NEXT: f32.lt $push6=, $4, $8 ; NO-SIMD128-FAST-NEXT: f32.select $push7=, $8, $4, $pop6 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop9), $pop7 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop7 ; NO-SIMD128-FAST-NEXT: return %c = fcmp olt <4 x float> %x, %y %a = select <4 x i1> %c, <4 x 
float> %y, <4 x float> %x @@ -14423,28 +12611,26 @@ define <4 x i32> @pmax_int_v4f32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-LABEL: pmax_int_v4f32: ; NO-SIMD128: .functype pmax_int_v4f32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 ; NO-SIMD128-NEXT: f32.reinterpret_i32 $push1=, $4 ; NO-SIMD128-NEXT: f32.reinterpret_i32 $push0=, $8 ; NO-SIMD128-NEXT: f32.lt $push2=, $pop1, $pop0 ; NO-SIMD128-NEXT: i32.select $push3=, $8, $4, $pop2 -; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3 -; NO-SIMD128-NEXT: f32.reinterpret_i32 $push7=, $3 -; NO-SIMD128-NEXT: f32.reinterpret_i32 $push6=, $7 -; NO-SIMD128-NEXT: f32.lt $push8=, $pop7, $pop6 -; NO-SIMD128-NEXT: i32.select $push9=, $7, $3, $pop8 -; NO-SIMD128-NEXT: i32.store 8($0), $pop9 -; NO-SIMD128-NEXT: f32.reinterpret_i32 $push11=, $2 -; NO-SIMD128-NEXT: f32.reinterpret_i32 $push10=, $6 -; NO-SIMD128-NEXT: f32.lt $push12=, $pop11, $pop10 -; NO-SIMD128-NEXT: i32.select $push13=, $6, $2, $pop12 -; NO-SIMD128-NEXT: i32.store 4($0), $pop13 -; NO-SIMD128-NEXT: f32.reinterpret_i32 $push15=, $1 -; NO-SIMD128-NEXT: f32.reinterpret_i32 $push14=, $5 -; NO-SIMD128-NEXT: f32.lt $push16=, $pop15, $pop14 -; NO-SIMD128-NEXT: i32.select $push17=, $5, $1, $pop16 -; NO-SIMD128-NEXT: i32.store 0($0), $pop17 +; NO-SIMD128-NEXT: i32.store 12($0), $pop3 +; NO-SIMD128-NEXT: f32.reinterpret_i32 $push5=, $3 +; NO-SIMD128-NEXT: f32.reinterpret_i32 $push4=, $7 +; NO-SIMD128-NEXT: f32.lt $push6=, $pop5, $pop4 +; NO-SIMD128-NEXT: i32.select $push7=, $7, $3, $pop6 +; NO-SIMD128-NEXT: i32.store 8($0), $pop7 +; NO-SIMD128-NEXT: f32.reinterpret_i32 $push9=, $2 +; NO-SIMD128-NEXT: f32.reinterpret_i32 $push8=, $6 +; NO-SIMD128-NEXT: f32.lt $push10=, $pop9, $pop8 +; NO-SIMD128-NEXT: i32.select $push11=, $6, $2, $pop10 +; NO-SIMD128-NEXT: i32.store 4($0), $pop11 +; NO-SIMD128-NEXT: f32.reinterpret_i32 $push13=, $1 +; NO-SIMD128-NEXT: 
f32.reinterpret_i32 $push12=, $5 +; NO-SIMD128-NEXT: f32.lt $push14=, $pop13, $pop12 +; NO-SIMD128-NEXT: i32.select $push15=, $5, $1, $pop14 +; NO-SIMD128-NEXT: i32.store 0($0), $pop15 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: pmax_int_v4f32: @@ -14465,13 +12651,11 @@ define <4 x i32> @pmax_int_v4f32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-FAST-NEXT: f32.lt $push10=, $pop9, $pop8 ; NO-SIMD128-FAST-NEXT: i32.select $push11=, $7, $3, $pop10 ; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop11 -; NO-SIMD128-FAST-NEXT: i32.const $push16=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16 ; NO-SIMD128-FAST-NEXT: f32.reinterpret_i32 $push13=, $4 ; NO-SIMD128-FAST-NEXT: f32.reinterpret_i32 $push12=, $8 ; NO-SIMD128-FAST-NEXT: f32.lt $push14=, $pop13, $pop12 ; NO-SIMD128-FAST-NEXT: i32.select $push15=, $8, $4, $pop14 -; NO-SIMD128-FAST-NEXT: i32.store 0($pop17), $pop15 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop15 ; NO-SIMD128-FAST-NEXT: return %fx = bitcast <4 x i32> %x to <4 x float> %fy = bitcast <4 x i32> %y to <4 x float> @@ -14496,16 +12680,14 @@ define <4 x float> @add_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: add_v4f32: ; NO-SIMD128: .functype add_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: f32.add $push0=, $3, $7 -; NO-SIMD128-NEXT: f32.store 8($0), $pop0 -; NO-SIMD128-NEXT: f32.add $push1=, $2, $6 -; NO-SIMD128-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-NEXT: f32.add $push2=, $1, $5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: f32.add $push3=, $4, $8 -; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: f32.add $push0=, $4, $8 +; NO-SIMD128-NEXT: f32.store 12($0), $pop0 +; NO-SIMD128-NEXT: f32.add $push1=, $3, $7 +; NO-SIMD128-NEXT: f32.store 8($0), $pop1 +; NO-SIMD128-NEXT: f32.add $push2=, $2, $6 +; NO-SIMD128-NEXT: f32.store 4($0), $pop2 +; 
NO-SIMD128-NEXT: f32.add $push3=, $1, $5 +; NO-SIMD128-NEXT: f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: add_v4f32: @@ -14517,10 +12699,8 @@ define <4 x float> @add_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: f32.add $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: f32.add $push5=, $4, $8 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: f32.add $push3=, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = fadd <4 x float> %x, %y ret <4 x float> %a @@ -14542,16 +12722,14 @@ define <4 x float> @sub_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: sub_v4f32: ; NO-SIMD128: .functype sub_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: f32.sub $push0=, $3, $7 -; NO-SIMD128-NEXT: f32.store 8($0), $pop0 -; NO-SIMD128-NEXT: f32.sub $push1=, $2, $6 -; NO-SIMD128-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-NEXT: f32.sub $push2=, $1, $5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: f32.sub $push3=, $4, $8 -; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: f32.sub $push0=, $4, $8 +; NO-SIMD128-NEXT: f32.store 12($0), $pop0 +; NO-SIMD128-NEXT: f32.sub $push1=, $3, $7 +; NO-SIMD128-NEXT: f32.store 8($0), $pop1 +; NO-SIMD128-NEXT: f32.sub $push2=, $2, $6 +; NO-SIMD128-NEXT: f32.store 4($0), $pop2 +; NO-SIMD128-NEXT: f32.sub $push3=, $1, $5 +; NO-SIMD128-NEXT: f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: sub_v4f32: @@ -14563,10 +12741,8 @@ define <4 x float> @sub_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 ; 
NO-SIMD128-FAST-NEXT: f32.sub $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: f32.sub $push5=, $4, $8 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: f32.sub $push3=, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = fsub <4 x float> %x, %y ret <4 x float> %a @@ -14588,16 +12764,14 @@ define <4 x float> @div_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: div_v4f32: ; NO-SIMD128: .functype div_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: f32.div $push0=, $3, $7 -; NO-SIMD128-NEXT: f32.store 8($0), $pop0 -; NO-SIMD128-NEXT: f32.div $push1=, $2, $6 -; NO-SIMD128-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-NEXT: f32.div $push2=, $1, $5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: f32.div $push3=, $4, $8 -; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: f32.div $push0=, $4, $8 +; NO-SIMD128-NEXT: f32.store 12($0), $pop0 +; NO-SIMD128-NEXT: f32.div $push1=, $3, $7 +; NO-SIMD128-NEXT: f32.store 8($0), $pop1 +; NO-SIMD128-NEXT: f32.div $push2=, $2, $6 +; NO-SIMD128-NEXT: f32.store 4($0), $pop2 +; NO-SIMD128-NEXT: f32.div $push3=, $1, $5 +; NO-SIMD128-NEXT: f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: div_v4f32: @@ -14609,10 +12783,8 @@ define <4 x float> @div_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: f32.div $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: f32.div $push5=, $4, $8 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5 +; 
NO-SIMD128-FAST-NEXT: f32.div $push3=, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = fdiv <4 x float> %x, %y ret <4 x float> %a @@ -14634,16 +12806,14 @@ define <4 x float> @mul_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: mul_v4f32: ; NO-SIMD128: .functype mul_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: f32.mul $push0=, $3, $7 -; NO-SIMD128-NEXT: f32.store 8($0), $pop0 -; NO-SIMD128-NEXT: f32.mul $push1=, $2, $6 -; NO-SIMD128-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-NEXT: f32.mul $push2=, $1, $5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push4=, 12 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: f32.mul $push3=, $4, $8 -; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3 +; NO-SIMD128-NEXT: f32.mul $push0=, $4, $8 +; NO-SIMD128-NEXT: f32.store 12($0), $pop0 +; NO-SIMD128-NEXT: f32.mul $push1=, $3, $7 +; NO-SIMD128-NEXT: f32.store 8($0), $pop1 +; NO-SIMD128-NEXT: f32.mul $push2=, $2, $6 +; NO-SIMD128-NEXT: f32.store 4($0), $pop2 +; NO-SIMD128-NEXT: f32.mul $push3=, $1, $5 +; NO-SIMD128-NEXT: f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: mul_v4f32: @@ -14655,10 +12825,8 @@ define <4 x float> @mul_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: f32.mul $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: f32.mul $push5=, $4, $8 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: f32.mul $push3=, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = fmul <4 x float> %x, %y ret <4 x float> %a @@ -14681,16 +12849,14 @@ define <4 x float> @sqrt_v4f32(<4 x float> %x) { ; NO-SIMD128-LABEL: sqrt_v4f32: ; NO-SIMD128: .functype 
sqrt_v4f32 (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: f32.sqrt $push0=, $3 -; NO-SIMD128-NEXT: f32.store 8($0), $pop0 -; NO-SIMD128-NEXT: f32.sqrt $push1=, $2 -; NO-SIMD128-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-NEXT: f32.sqrt $push2=, $1 -; NO-SIMD128-NEXT: f32.store 0($0), $pop2 -; NO-SIMD128-NEXT: i32.const $push3=, 12 -; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-NEXT: f32.sqrt $push5=, $4 -; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-NEXT: f32.sqrt $push0=, $4 +; NO-SIMD128-NEXT: f32.store 12($0), $pop0 +; NO-SIMD128-NEXT: f32.sqrt $push1=, $3 +; NO-SIMD128-NEXT: f32.store 8($0), $pop1 +; NO-SIMD128-NEXT: f32.sqrt $push2=, $2 +; NO-SIMD128-NEXT: f32.store 4($0), $pop2 +; NO-SIMD128-NEXT: f32.sqrt $push3=, $1 +; NO-SIMD128-NEXT: f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: sqrt_v4f32: @@ -14702,10 +12868,8 @@ define <4 x float> @sqrt_v4f32(<4 x float> %x) { ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 ; NO-SIMD128-FAST-NEXT: f32.sqrt $push2=, $3 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12 -; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3 -; NO-SIMD128-FAST-NEXT: f32.sqrt $push5=, $4 -; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5 +; NO-SIMD128-FAST-NEXT: f32.sqrt $push3=, $4 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) ret <4 x float> %a diff --git a/llvm/test/CodeGen/WebAssembly/simd.ll b/llvm/test/CodeGen/WebAssembly/simd.ll index d2a38de4cc85..5ec9f6a2a321 100644 --- a/llvm/test/CodeGen/WebAssembly/simd.ll +++ b/llvm/test/CodeGen/WebAssembly/simd.ll @@ -38,44 +38,22 @@ define <16 x i8> @splat_v16i8(i8 %x) { ; NO-SIMD128-LABEL: splat_v16i8: ; NO-SIMD128: .functype splat_v16i8 (i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store8 15($0), $1 +; NO-SIMD128-NEXT: i32.store8 14($0), $1 +; 
NO-SIMD128-NEXT: i32.store8 13($0), $1 +; NO-SIMD128-NEXT: i32.store8 12($0), $1 +; NO-SIMD128-NEXT: i32.store8 11($0), $1 +; NO-SIMD128-NEXT: i32.store8 10($0), $1 +; NO-SIMD128-NEXT: i32.store8 9($0), $1 ; NO-SIMD128-NEXT: i32.store8 8($0), $1 +; NO-SIMD128-NEXT: i32.store8 7($0), $1 +; NO-SIMD128-NEXT: i32.store8 6($0), $1 +; NO-SIMD128-NEXT: i32.store8 5($0), $1 ; NO-SIMD128-NEXT: i32.store8 4($0), $1 +; NO-SIMD128-NEXT: i32.store8 3($0), $1 ; NO-SIMD128-NEXT: i32.store8 2($0), $1 ; NO-SIMD128-NEXT: i32.store8 1($0), $1 ; NO-SIMD128-NEXT: i32.store8 0($0), $1 -; NO-SIMD128-NEXT: i32.const $push0=, 15 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store8 0($pop1), $1 -; NO-SIMD128-NEXT: i32.const $push2=, 14 -; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2 -; NO-SIMD128-NEXT: i32.store8 0($pop3), $1 -; NO-SIMD128-NEXT: i32.const $push4=, 13 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.store8 0($pop5), $1 -; NO-SIMD128-NEXT: i32.const $push6=, 12 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.store8 0($pop7), $1 -; NO-SIMD128-NEXT: i32.const $push8=, 11 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.store8 0($pop9), $1 -; NO-SIMD128-NEXT: i32.const $push10=, 10 -; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-NEXT: i32.store8 0($pop11), $1 -; NO-SIMD128-NEXT: i32.const $push12=, 9 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.store8 0($pop13), $1 -; NO-SIMD128-NEXT: i32.const $push14=, 7 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.store8 0($pop15), $1 -; NO-SIMD128-NEXT: i32.const $push16=, 6 -; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.store8 0($pop17), $1 -; NO-SIMD128-NEXT: i32.const $push18=, 5 -; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-NEXT: i32.store8 0($pop19), $1 -; NO-SIMD128-NEXT: i32.const $push20=, 3 -; NO-SIMD128-NEXT: i32.add $push21=, $0, 
$pop20 -; NO-SIMD128-NEXT: i32.store8 0($pop21), $1 ; NO-SIMD128-NEXT: return %v = insertelement <16 x i8> undef, i8 %x, i32 0 %res = shufflevector <16 x i8> %v, <16 x i8> undef, @@ -356,44 +334,22 @@ define <16 x i8> @replace_v16i8(<16 x i8> %v, i8 %x) { ; NO-SIMD128-LABEL: replace_v16i8: ; NO-SIMD128: .functype replace_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store8 15($0), $16 +; NO-SIMD128-NEXT: i32.store8 14($0), $15 +; NO-SIMD128-NEXT: i32.store8 13($0), $14 +; NO-SIMD128-NEXT: i32.store8 12($0), $13 +; NO-SIMD128-NEXT: i32.store8 11($0), $17 +; NO-SIMD128-NEXT: i32.store8 10($0), $11 +; NO-SIMD128-NEXT: i32.store8 9($0), $10 ; NO-SIMD128-NEXT: i32.store8 8($0), $9 +; NO-SIMD128-NEXT: i32.store8 7($0), $8 +; NO-SIMD128-NEXT: i32.store8 6($0), $7 +; NO-SIMD128-NEXT: i32.store8 5($0), $6 ; NO-SIMD128-NEXT: i32.store8 4($0), $5 +; NO-SIMD128-NEXT: i32.store8 3($0), $4 ; NO-SIMD128-NEXT: i32.store8 2($0), $3 ; NO-SIMD128-NEXT: i32.store8 1($0), $2 ; NO-SIMD128-NEXT: i32.store8 0($0), $1 -; NO-SIMD128-NEXT: i32.const $push0=, 15 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store8 0($pop1), $16 -; NO-SIMD128-NEXT: i32.const $push2=, 14 -; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2 -; NO-SIMD128-NEXT: i32.store8 0($pop3), $15 -; NO-SIMD128-NEXT: i32.const $push4=, 13 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.store8 0($pop5), $14 -; NO-SIMD128-NEXT: i32.const $push6=, 12 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.store8 0($pop7), $13 -; NO-SIMD128-NEXT: i32.const $push8=, 11 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.store8 0($pop9), $17 -; NO-SIMD128-NEXT: i32.const $push10=, 10 -; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-NEXT: i32.store8 0($pop11), $11 -; NO-SIMD128-NEXT: i32.const $push12=, 9 -; NO-SIMD128-NEXT: i32.add 
$push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.store8 0($pop13), $10 -; NO-SIMD128-NEXT: i32.const $push14=, 7 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.store8 0($pop15), $8 -; NO-SIMD128-NEXT: i32.const $push16=, 6 -; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.store8 0($pop17), $7 -; NO-SIMD128-NEXT: i32.const $push18=, 5 -; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-NEXT: i32.store8 0($pop19), $6 -; NO-SIMD128-NEXT: i32.const $push20=, 3 -; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20 -; NO-SIMD128-NEXT: i32.store8 0($pop21), $4 ; NO-SIMD128-NEXT: return %res = insertelement <16 x i8> %v, i8 %x, i32 11 ret <16 x i8> %res @@ -461,44 +417,22 @@ define <16 x i8> @replace_zero_v16i8(<16 x i8> %v, i8 %x) { ; NO-SIMD128-LABEL: replace_zero_v16i8: ; NO-SIMD128: .functype replace_zero_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store8 15($0), $16 +; NO-SIMD128-NEXT: i32.store8 14($0), $15 +; NO-SIMD128-NEXT: i32.store8 13($0), $14 +; NO-SIMD128-NEXT: i32.store8 12($0), $13 +; NO-SIMD128-NEXT: i32.store8 11($0), $12 +; NO-SIMD128-NEXT: i32.store8 10($0), $11 +; NO-SIMD128-NEXT: i32.store8 9($0), $10 ; NO-SIMD128-NEXT: i32.store8 8($0), $9 +; NO-SIMD128-NEXT: i32.store8 7($0), $8 +; NO-SIMD128-NEXT: i32.store8 6($0), $7 +; NO-SIMD128-NEXT: i32.store8 5($0), $6 ; NO-SIMD128-NEXT: i32.store8 4($0), $5 +; NO-SIMD128-NEXT: i32.store8 3($0), $4 ; NO-SIMD128-NEXT: i32.store8 2($0), $3 ; NO-SIMD128-NEXT: i32.store8 1($0), $2 ; NO-SIMD128-NEXT: i32.store8 0($0), $17 -; NO-SIMD128-NEXT: i32.const $push0=, 15 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store8 0($pop1), $16 -; NO-SIMD128-NEXT: i32.const $push2=, 14 -; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2 -; NO-SIMD128-NEXT: i32.store8 0($pop3), $15 -; NO-SIMD128-NEXT: i32.const $push4=, 13 -; NO-SIMD128-NEXT: i32.add 
$push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.store8 0($pop5), $14 -; NO-SIMD128-NEXT: i32.const $push6=, 12 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.store8 0($pop7), $13 -; NO-SIMD128-NEXT: i32.const $push8=, 11 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.store8 0($pop9), $12 -; NO-SIMD128-NEXT: i32.const $push10=, 10 -; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-NEXT: i32.store8 0($pop11), $11 -; NO-SIMD128-NEXT: i32.const $push12=, 9 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.store8 0($pop13), $10 -; NO-SIMD128-NEXT: i32.const $push14=, 7 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.store8 0($pop15), $8 -; NO-SIMD128-NEXT: i32.const $push16=, 6 -; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.store8 0($pop17), $7 -; NO-SIMD128-NEXT: i32.const $push18=, 5 -; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-NEXT: i32.store8 0($pop19), $6 -; NO-SIMD128-NEXT: i32.const $push20=, 3 -; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20 -; NO-SIMD128-NEXT: i32.store8 0($pop21), $4 ; NO-SIMD128-NEXT: return %res = insertelement <16 x i8> %v, i8 %x, i32 0 ret <16 x i8> %res @@ -514,44 +448,22 @@ define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-LABEL: shuffle_v16i8: ; NO-SIMD128: .functype shuffle_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store8 15($0), $32 +; NO-SIMD128-NEXT: i32.store8 14($0), $15 +; NO-SIMD128-NEXT: i32.store8 13($0), $30 +; NO-SIMD128-NEXT: i32.store8 12($0), $13 +; NO-SIMD128-NEXT: i32.store8 11($0), $28 +; NO-SIMD128-NEXT: i32.store8 10($0), $11 +; NO-SIMD128-NEXT: i32.store8 9($0), $26 ; NO-SIMD128-NEXT: i32.store8 8($0), $9 +; NO-SIMD128-NEXT: i32.store8 7($0), $24 +; 
NO-SIMD128-NEXT: i32.store8 6($0), $7 +; NO-SIMD128-NEXT: i32.store8 5($0), $22 ; NO-SIMD128-NEXT: i32.store8 4($0), $5 +; NO-SIMD128-NEXT: i32.store8 3($0), $20 ; NO-SIMD128-NEXT: i32.store8 2($0), $3 ; NO-SIMD128-NEXT: i32.store8 1($0), $18 ; NO-SIMD128-NEXT: i32.store8 0($0), $1 -; NO-SIMD128-NEXT: i32.const $push0=, 15 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store8 0($pop1), $32 -; NO-SIMD128-NEXT: i32.const $push2=, 14 -; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2 -; NO-SIMD128-NEXT: i32.store8 0($pop3), $15 -; NO-SIMD128-NEXT: i32.const $push4=, 13 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.store8 0($pop5), $30 -; NO-SIMD128-NEXT: i32.const $push6=, 12 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.store8 0($pop7), $13 -; NO-SIMD128-NEXT: i32.const $push8=, 11 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.store8 0($pop9), $28 -; NO-SIMD128-NEXT: i32.const $push10=, 10 -; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-NEXT: i32.store8 0($pop11), $11 -; NO-SIMD128-NEXT: i32.const $push12=, 9 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.store8 0($pop13), $26 -; NO-SIMD128-NEXT: i32.const $push14=, 7 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.store8 0($pop15), $24 -; NO-SIMD128-NEXT: i32.const $push16=, 6 -; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.store8 0($pop17), $7 -; NO-SIMD128-NEXT: i32.const $push18=, 5 -; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-NEXT: i32.store8 0($pop19), $22 -; NO-SIMD128-NEXT: i32.const $push20=, 3 -; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20 -; NO-SIMD128-NEXT: i32.store8 0($pop21), $20 ; NO-SIMD128-NEXT: return %res = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-LABEL: shuffle_undef_v16i8: ; NO-SIMD128: .functype shuffle_undef_v16i8 
(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store8 15($0), $2 +; NO-SIMD128-NEXT: i32.store8 14($0), $2 +; NO-SIMD128-NEXT: i32.store8 13($0), $2 +; NO-SIMD128-NEXT: i32.store8 12($0), $2 +; NO-SIMD128-NEXT: i32.store8 11($0), $2 +; NO-SIMD128-NEXT: i32.store8 10($0), $2 +; NO-SIMD128-NEXT: i32.store8 9($0), $2 ; NO-SIMD128-NEXT: i32.store8 8($0), $2 +; NO-SIMD128-NEXT: i32.store8 7($0), $2 +; NO-SIMD128-NEXT: i32.store8 6($0), $2 +; NO-SIMD128-NEXT: i32.store8 5($0), $2 ; NO-SIMD128-NEXT: i32.store8 4($0), $2 +; NO-SIMD128-NEXT: i32.store8 3($0), $2 ; NO-SIMD128-NEXT: i32.store8 2($0), $2 ; NO-SIMD128-NEXT: i32.store8 1($0), $2 ; NO-SIMD128-NEXT: i32.store8 0($0), $2 -; NO-SIMD128-NEXT: i32.const $push0=, 15 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store8 0($pop1), $2 -; NO-SIMD128-NEXT: i32.const $push2=, 14 -; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2 -; NO-SIMD128-NEXT: i32.store8 0($pop3), $2 -; NO-SIMD128-NEXT: i32.const $push4=, 13 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.store8 0($pop5), $2 -; NO-SIMD128-NEXT: i32.const $push6=, 12 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.store8 0($pop7), $2 -; NO-SIMD128-NEXT: i32.const $push8=, 11 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.store8 0($pop9), $2 -; NO-SIMD128-NEXT: i32.const $push10=, 10 -; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-NEXT: i32.store8 0($pop11), $2 -; NO-SIMD128-NEXT: i32.const $push12=, 9 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.store8 0($pop13), $2 -; NO-SIMD128-NEXT: i32.const $push14=, 7 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.store8 0($pop15), $2 -; NO-SIMD128-NEXT: i32.const $push16=, 6 -; NO-SIMD128-NEXT: 
i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.store8 0($pop17), $2 -; NO-SIMD128-NEXT: i32.const $push18=, 5 -; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-NEXT: i32.store8 0($pop19), $2 -; NO-SIMD128-NEXT: i32.const $push20=, 3 -; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20 -; NO-SIMD128-NEXT: i32.store8 0($pop21), $2 ; NO-SIMD128-NEXT: return %res = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> @build_v16i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3, ; NO-SIMD128-LABEL: build_v16i8: ; NO-SIMD128: .functype build_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store8 15($0), $16 +; NO-SIMD128-NEXT: i32.store8 14($0), $15 +; NO-SIMD128-NEXT: i32.store8 13($0), $14 +; NO-SIMD128-NEXT: i32.store8 12($0), $13 +; NO-SIMD128-NEXT: i32.store8 11($0), $12 +; NO-SIMD128-NEXT: i32.store8 10($0), $11 +; NO-SIMD128-NEXT: i32.store8 9($0), $10 ; NO-SIMD128-NEXT: i32.store8 8($0), $9 +; NO-SIMD128-NEXT: i32.store8 7($0), $8 +; NO-SIMD128-NEXT: i32.store8 6($0), $7 +; NO-SIMD128-NEXT: i32.store8 5($0), $6 ; NO-SIMD128-NEXT: i32.store8 4($0), $5 +; NO-SIMD128-NEXT: i32.store8 3($0), $4 ; NO-SIMD128-NEXT: i32.store8 2($0), $3 ; NO-SIMD128-NEXT: i32.store8 1($0), $2 ; NO-SIMD128-NEXT: i32.store8 0($0), $1 -; NO-SIMD128-NEXT: i32.const $push0=, 15 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store8 0($pop1), $16 -; NO-SIMD128-NEXT: i32.const $push2=, 14 -; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2 -; NO-SIMD128-NEXT: i32.store8 0($pop3), $15 -; NO-SIMD128-NEXT: i32.const $push4=, 13 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.store8 0($pop5), $14 -; NO-SIMD128-NEXT: i32.const $push6=, 12 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.store8 0($pop7), $13 -; NO-SIMD128-NEXT: i32.const $push8=, 11 -; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8 -; NO-SIMD128-NEXT: i32.store8 0($pop9), $12 
-; NO-SIMD128-NEXT: i32.const $push10=, 10 -; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10 -; NO-SIMD128-NEXT: i32.store8 0($pop11), $11 -; NO-SIMD128-NEXT: i32.const $push12=, 9 -; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12 -; NO-SIMD128-NEXT: i32.store8 0($pop13), $10 -; NO-SIMD128-NEXT: i32.const $push14=, 7 -; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14 -; NO-SIMD128-NEXT: i32.store8 0($pop15), $8 -; NO-SIMD128-NEXT: i32.const $push16=, 6 -; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16 -; NO-SIMD128-NEXT: i32.store8 0($pop17), $7 -; NO-SIMD128-NEXT: i32.const $push18=, 5 -; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18 -; NO-SIMD128-NEXT: i32.store8 0($pop19), $6 -; NO-SIMD128-NEXT: i32.const $push20=, 3 -; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20 -; NO-SIMD128-NEXT: i32.store8 0($pop21), $4 ; NO-SIMD128-NEXT: return i8 %x4, i8 %x5, i8 %x6, i8 %x7, i8 %x8, i8 %x9, i8 %x10, i8 %x11, @@ -734,22 +602,14 @@ define <8 x i16> @splat_v8i16(i16 %x) { ; NO-SIMD128-LABEL: splat_v8i16: ; NO-SIMD128: .functype splat_v8i16 (i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store16 14($0), $1 +; NO-SIMD128-NEXT: i32.store16 12($0), $1 +; NO-SIMD128-NEXT: i32.store16 10($0), $1 ; NO-SIMD128-NEXT: i32.store16 8($0), $1 +; NO-SIMD128-NEXT: i32.store16 6($0), $1 ; NO-SIMD128-NEXT: i32.store16 4($0), $1 ; NO-SIMD128-NEXT: i32.store16 2($0), $1 ; NO-SIMD128-NEXT: i32.store16 0($0), $1 -; NO-SIMD128-NEXT: i32.const $push0=, 14 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store16 0($pop1), $1 -; NO-SIMD128-NEXT: i32.const $push2=, 12 -; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2 -; NO-SIMD128-NEXT: i32.store16 0($pop3), $1 -; NO-SIMD128-NEXT: i32.const $push4=, 10 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.store16 0($pop5), $1 -; NO-SIMD128-NEXT: i32.const $push6=, 6 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.store16 0($pop7), $1 ; NO-SIMD128-NEXT: return %v = 
insertelement <8 x i16> undef, i16 %x, i32 0 %res = shufflevector <8 x i16> %v, <8 x i16> undef, @@ -1016,22 +876,14 @@ define <8 x i16> @replace_v8i16(<8 x i16> %v, i16 %x) { ; NO-SIMD128-LABEL: replace_v8i16: ; NO-SIMD128: .functype replace_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store16 14($0), $9 +; NO-SIMD128-NEXT: i32.store16 12($0), $7 +; NO-SIMD128-NEXT: i32.store16 10($0), $6 ; NO-SIMD128-NEXT: i32.store16 8($0), $5 +; NO-SIMD128-NEXT: i32.store16 6($0), $4 ; NO-SIMD128-NEXT: i32.store16 4($0), $3 ; NO-SIMD128-NEXT: i32.store16 2($0), $2 ; NO-SIMD128-NEXT: i32.store16 0($0), $1 -; NO-SIMD128-NEXT: i32.const $push0=, 14 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store16 0($pop1), $9 -; NO-SIMD128-NEXT: i32.const $push2=, 12 -; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2 -; NO-SIMD128-NEXT: i32.store16 0($pop3), $7 -; NO-SIMD128-NEXT: i32.const $push4=, 10 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.store16 0($pop5), $6 -; NO-SIMD128-NEXT: i32.const $push6=, 6 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.store16 0($pop7), $4 ; NO-SIMD128-NEXT: return %res = insertelement <8 x i16> %v, i16 %x, i32 7 ret <8 x i16> %res @@ -1095,22 +947,14 @@ define <8 x i16> @replace_zero_v8i16(<8 x i16> %v, i16 %x) { ; NO-SIMD128-LABEL: replace_zero_v8i16: ; NO-SIMD128: .functype replace_zero_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store16 14($0), $8 +; NO-SIMD128-NEXT: i32.store16 12($0), $7 +; NO-SIMD128-NEXT: i32.store16 10($0), $6 ; NO-SIMD128-NEXT: i32.store16 8($0), $5 +; NO-SIMD128-NEXT: i32.store16 6($0), $4 ; NO-SIMD128-NEXT: i32.store16 4($0), $3 ; NO-SIMD128-NEXT: i32.store16 2($0), $2 ; NO-SIMD128-NEXT: i32.store16 0($0), $9 -; NO-SIMD128-NEXT: i32.const $push0=, 14 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: 
i32.store16 0($pop1), $8 -; NO-SIMD128-NEXT: i32.const $push2=, 12 -; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2 -; NO-SIMD128-NEXT: i32.store16 0($pop3), $7 -; NO-SIMD128-NEXT: i32.const $push4=, 10 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.store16 0($pop5), $6 -; NO-SIMD128-NEXT: i32.const $push6=, 6 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.store16 0($pop7), $4 ; NO-SIMD128-NEXT: return %res = insertelement <8 x i16> %v, i16 %x, i32 0 ret <8 x i16> %res @@ -1126,22 +970,14 @@ define <8 x i16> @shuffle_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-LABEL: shuffle_v8i16: ; NO-SIMD128: .functype shuffle_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store16 14($0), $16 +; NO-SIMD128-NEXT: i32.store16 12($0), $7 +; NO-SIMD128-NEXT: i32.store16 10($0), $14 ; NO-SIMD128-NEXT: i32.store16 8($0), $5 +; NO-SIMD128-NEXT: i32.store16 6($0), $12 ; NO-SIMD128-NEXT: i32.store16 4($0), $3 ; NO-SIMD128-NEXT: i32.store16 2($0), $10 ; NO-SIMD128-NEXT: i32.store16 0($0), $1 -; NO-SIMD128-NEXT: i32.const $push0=, 14 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store16 0($pop1), $16 -; NO-SIMD128-NEXT: i32.const $push2=, 12 -; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2 -; NO-SIMD128-NEXT: i32.store16 0($pop3), $7 -; NO-SIMD128-NEXT: i32.const $push4=, 10 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.store16 0($pop5), $14 -; NO-SIMD128-NEXT: i32.const $push6=, 6 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.store16 0($pop7), $12 ; NO-SIMD128-NEXT: return %res = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> @@ -1158,22 +994,14 @@ define <8 x i16> @shuffle_undef_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-LABEL: shuffle_undef_v8i16: ; NO-SIMD128: .functype shuffle_undef_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store16 14($0), $2 +; NO-SIMD128-NEXT: i32.store16 12($0), $2 +; NO-SIMD128-NEXT: i32.store16 10($0), $2 ; NO-SIMD128-NEXT: i32.store16 8($0), $2 +; NO-SIMD128-NEXT: i32.store16 6($0), $2 ; NO-SIMD128-NEXT: i32.store16 4($0), $2 ; NO-SIMD128-NEXT: i32.store16 2($0), $2 ; NO-SIMD128-NEXT: i32.store16 0($0), $2 -; NO-SIMD128-NEXT: i32.const $push0=, 14 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store16 0($pop1), $2 -; NO-SIMD128-NEXT: i32.const $push2=, 12 -; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2 -; NO-SIMD128-NEXT: i32.store16 0($pop3), $2 -; NO-SIMD128-NEXT: i32.const $push4=, 10 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.store16 0($pop5), $2 -; NO-SIMD128-NEXT: i32.const $push6=, 6 -; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.store16 0($pop7), $2 ; NO-SIMD128-NEXT: return %res = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> @build_v8i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3, ; NO-SIMD128-LABEL: build_v8i16: ; NO-SIMD128: .functype build_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store16 14($0), $8 +; NO-SIMD128-NEXT: i32.store16 12($0), $7 +; NO-SIMD128-NEXT: i32.store16 10($0), $6 ; NO-SIMD128-NEXT: i32.store16 8($0), $5 +; NO-SIMD128-NEXT: i32.store16 6($0), $4 ; NO-SIMD128-NEXT: i32.store16 4($0), $3 ; NO-SIMD128-NEXT: i32.store16 2($0), $2 ; NO-SIMD128-NEXT: i32.store16 0($0), $1 -; NO-SIMD128-NEXT: i32.const $push0=, 14 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store16 0($pop1), $8 -; NO-SIMD128-NEXT: i32.const $push2=, 12 -; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2 -; NO-SIMD128-NEXT: i32.store16 0($pop3), $7 -; NO-SIMD128-NEXT: i32.const $push4=, 10 -; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4 -; NO-SIMD128-NEXT: i32.store16 0($pop5), $6 -; NO-SIMD128-NEXT: i32.const $push6=, 6 -; 
NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6 -; NO-SIMD128-NEXT: i32.store16 0($pop7), $4 ; NO-SIMD128-NEXT: return i16 %x4, i16 %x5, i16 %x6, i16 %x7) { %t0 = insertelement <8 x i16> undef, i16 %x0, i32 0 @@ -1258,12 +1078,10 @@ define <4 x i32> @splat_v4i32(i32 %x) { ; NO-SIMD128-LABEL: splat_v4i32: ; NO-SIMD128: .functype splat_v4i32 (i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store 12($0), $1 ; NO-SIMD128-NEXT: i32.store 8($0), $1 ; NO-SIMD128-NEXT: i32.store 4($0), $1 ; NO-SIMD128-NEXT: i32.store 0($0), $1 -; NO-SIMD128-NEXT: i32.const $push0=, 12 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store 0($pop1), $1 ; NO-SIMD128-NEXT: return %v = insertelement <4 x i32> undef, i32 %x, i32 0 %res = shufflevector <4 x i32> %v, <4 x i32> undef, @@ -1368,12 +1186,10 @@ define <4 x i32> @replace_v4i32(<4 x i32> %v, i32 %x) { ; NO-SIMD128-LABEL: replace_v4i32: ; NO-SIMD128: .functype replace_v4i32 (i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store 12($0), $4 ; NO-SIMD128-NEXT: i32.store 8($0), $5 ; NO-SIMD128-NEXT: i32.store 4($0), $2 ; NO-SIMD128-NEXT: i32.store 0($0), $1 -; NO-SIMD128-NEXT: i32.const $push0=, 12 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store 0($pop1), $4 ; NO-SIMD128-NEXT: return %res = insertelement <4 x i32> %v, i32 %x, i32 2 ret <4 x i32> %res @@ -1433,12 +1249,10 @@ define <4 x i32> @replace_zero_v4i32(<4 x i32> %v, i32 %x) { ; NO-SIMD128-LABEL: replace_zero_v4i32: ; NO-SIMD128: .functype replace_zero_v4i32 (i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store 12($0), $4 ; NO-SIMD128-NEXT: i32.store 8($0), $3 ; NO-SIMD128-NEXT: i32.store 4($0), $2 ; NO-SIMD128-NEXT: i32.store 0($0), $5 -; NO-SIMD128-NEXT: i32.const $push0=, 12 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store 0($pop1), $4 ; NO-SIMD128-NEXT: return %res = insertelement <4 x i32> %v, i32 %x, i32 
0 ret <4 x i32> %res @@ -1454,12 +1268,10 @@ define <4 x i32> @shuffle_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-LABEL: shuffle_v4i32: ; NO-SIMD128: .functype shuffle_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store 12($0), $8 ; NO-SIMD128-NEXT: i32.store 8($0), $3 ; NO-SIMD128-NEXT: i32.store 4($0), $6 ; NO-SIMD128-NEXT: i32.store 0($0), $1 -; NO-SIMD128-NEXT: i32.const $push0=, 12 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store 0($pop1), $8 ; NO-SIMD128-NEXT: return %res = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> @@ -1476,12 +1288,10 @@ define <4 x i32> @shuffle_undef_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-LABEL: shuffle_undef_v4i32: ; NO-SIMD128: .functype shuffle_undef_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store 12($0), $2 ; NO-SIMD128-NEXT: i32.store 8($0), $2 ; NO-SIMD128-NEXT: i32.store 4($0), $2 ; NO-SIMD128-NEXT: i32.store 0($0), $2 -; NO-SIMD128-NEXT: i32.const $push0=, 12 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store 0($pop1), $2 ; NO-SIMD128-NEXT: return %res = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> @@ -1501,12 +1311,10 @@ define <4 x i32> @build_v4i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) { ; NO-SIMD128-LABEL: build_v4i32: ; NO-SIMD128: .functype build_v4i32 (i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.store 12($0), $4 ; NO-SIMD128-NEXT: i32.store 8($0), $3 ; NO-SIMD128-NEXT: i32.store 4($0), $2 ; NO-SIMD128-NEXT: i32.store 0($0), $1 -; NO-SIMD128-NEXT: i32.const $push0=, 12 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: i32.store 0($pop1), $4 ; NO-SIMD128-NEXT: return %t0 = insertelement <4 x i32> undef, i32 %x0, i32 0 %t1 = insertelement <4 x i32> %t0, i32 %x1, i32 1 @@ -1801,12 +1609,10 @@ define <4 x float> @splat_v4f32(float %x) { ; NO-SIMD128-LABEL: 
splat_v4f32: ; NO-SIMD128: .functype splat_v4f32 (i32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: f32.store 12($0), $1 ; NO-SIMD128-NEXT: f32.store 8($0), $1 ; NO-SIMD128-NEXT: f32.store 4($0), $1 ; NO-SIMD128-NEXT: f32.store 0($0), $1 -; NO-SIMD128-NEXT: i32.const $push0=, 12 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: f32.store 0($pop1), $1 ; NO-SIMD128-NEXT: return %v = insertelement <4 x float> undef, float %x, i32 0 %res = shufflevector <4 x float> %v, <4 x float> undef, @@ -1911,12 +1717,10 @@ define <4 x float> @replace_v4f32(<4 x float> %v, float %x) { ; NO-SIMD128-LABEL: replace_v4f32: ; NO-SIMD128: .functype replace_v4f32 (i32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: f32.store 12($0), $4 ; NO-SIMD128-NEXT: f32.store 8($0), $5 ; NO-SIMD128-NEXT: f32.store 4($0), $2 ; NO-SIMD128-NEXT: f32.store 0($0), $1 -; NO-SIMD128-NEXT: i32.const $push0=, 12 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: f32.store 0($pop1), $4 ; NO-SIMD128-NEXT: return %res = insertelement <4 x float> %v, float %x, i32 2 ret <4 x float> %res @@ -1976,12 +1780,10 @@ define <4 x float> @replace_zero_v4f32(<4 x float> %v, float %x) { ; NO-SIMD128-LABEL: replace_zero_v4f32: ; NO-SIMD128: .functype replace_zero_v4f32 (i32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: f32.store 12($0), $4 ; NO-SIMD128-NEXT: f32.store 8($0), $3 ; NO-SIMD128-NEXT: f32.store 4($0), $2 ; NO-SIMD128-NEXT: f32.store 0($0), $5 -; NO-SIMD128-NEXT: i32.const $push0=, 12 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: f32.store 0($pop1), $4 ; NO-SIMD128-NEXT: return %res = insertelement <4 x float> %v, float %x, i32 0 ret <4 x float> %res @@ -1997,12 +1799,10 @@ define <4 x float> @shuffle_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: shuffle_v4f32: ; NO-SIMD128: .functype shuffle_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: 
# %bb.0: +; NO-SIMD128-NEXT: f32.store 12($0), $8 ; NO-SIMD128-NEXT: f32.store 8($0), $3 ; NO-SIMD128-NEXT: f32.store 4($0), $6 ; NO-SIMD128-NEXT: f32.store 0($0), $1 -; NO-SIMD128-NEXT: i32.const $push0=, 12 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: f32.store 0($pop1), $8 ; NO-SIMD128-NEXT: return %res = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> @@ -2019,12 +1819,10 @@ define <4 x float> @shuffle_undef_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: shuffle_undef_v4f32: ; NO-SIMD128: .functype shuffle_undef_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: f32.store 12($0), $2 ; NO-SIMD128-NEXT: f32.store 8($0), $2 ; NO-SIMD128-NEXT: f32.store 4($0), $2 ; NO-SIMD128-NEXT: f32.store 0($0), $2 -; NO-SIMD128-NEXT: i32.const $push0=, 12 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: f32.store 0($pop1), $2 ; NO-SIMD128-NEXT: return %res = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> @@ -2044,12 +1842,10 @@ define <4 x float> @build_v4f32(float %x0, float %x1, float %x2, float %x3) { ; NO-SIMD128-LABEL: build_v4f32: ; NO-SIMD128: .functype build_v4f32 (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: f32.store 12($0), $4 ; NO-SIMD128-NEXT: f32.store 8($0), $3 ; NO-SIMD128-NEXT: f32.store 4($0), $2 ; NO-SIMD128-NEXT: f32.store 0($0), $1 -; NO-SIMD128-NEXT: i32.const $push0=, 12 -; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0 -; NO-SIMD128-NEXT: f32.store 0($pop1), $4 ; NO-SIMD128-NEXT: return %t0 = insertelement <4 x float> undef, float %x0, i32 0 %t1 = insertelement <4 x float> %t0, float %x1, i32 1 -- cgit v1.2.3 From 51f1cb5355d296ccb7756944d0545d9c96066b78 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 4 Apr 2024 08:04:21 -0700 Subject: [X86] Add or_is_add patterns for INC. 
(#87584) Should fix the cases noted in #86857 --- llvm/lib/Target/X86/X86InstrCompiler.td | 5 +++++ llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll | 11 +++++++---- llvm/test/CodeGen/X86/load-local-v3i129.ll | 4 ++-- llvm/test/CodeGen/X86/pr23664.ll | 2 +- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index ce3b6af4cab4..270dd32c7235 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -2161,6 +2161,11 @@ multiclass EFLAGSDefiningPats { def : Pat<(X86sub_flag_nocf GR16:$src, -1), (!cast(INC16r#suffix) GR16:$src)>; def : Pat<(X86sub_flag_nocf GR32:$src, -1), (!cast(INC32r#suffix) GR32:$src)>; def : Pat<(X86sub_flag_nocf GR64:$src, -1), (!cast(INC64r#suffix) GR64:$src)>; + + def : Pat<(or_is_add GR8:$src, 1), (!cast(INC8r#suffix) GR8:$src)>; + def : Pat<(or_is_add GR16:$src, 1), (!cast(INC16r#suffix) GR16:$src)>; + def : Pat<(or_is_add GR32:$src, 1), (!cast(INC32r#suffix) GR32:$src)>; + def : Pat<(or_is_add GR64:$src, 1), (!cast(INC64r#suffix) GR64:$src)>; } } diff --git a/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll b/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll index 609be3bb2e54..50e736ac68d2 100644 --- a/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll +++ b/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc < %s | FileCheck %s ; Check that the shr(shl X, 56), 48) is not mistakenly turned into @@ -16,11 +17,13 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "x86_64-unknown-linux-gnu" define i64 @foo(i64 %b) nounwind readnone { -entry: ; CHECK-LABEL: foo: -; CHECK: movsbq %dil, %rax -; CHECK: shlq $8, %rax -; CHECK: orq $1, %rax +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movsbq %dil, %rax +; CHECK-NEXT: shlq 
$8, %rax +; CHECK-NEXT: incq %rax +; CHECK-NEXT: retq +entry: %shl = shl i64 %b, 56 ; [#uses=1] %shr = ashr i64 %shl, 48 ; [#uses=1] %add5 = or i64 %shr, 1 ; [#uses=1] diff --git a/llvm/test/CodeGen/X86/load-local-v3i129.ll b/llvm/test/CodeGen/X86/load-local-v3i129.ll index 8fa7ce066453..eb5d172a3b35 100644 --- a/llvm/test/CodeGen/X86/load-local-v3i129.ll +++ b/llvm/test/CodeGen/X86/load-local-v3i129.ll @@ -12,7 +12,7 @@ define void @_start() nounwind { ; FAST-SHLD-NEXT: shrq $2, %rcx ; FAST-SHLD-NEXT: shldq $2, %rdx, %rcx ; FAST-SHLD-NEXT: andq $-4, %rax -; FAST-SHLD-NEXT: orq $1, %rax +; FAST-SHLD-NEXT: incq %rax ; FAST-SHLD-NEXT: movq %rax, -40(%rsp) ; FAST-SHLD-NEXT: movq %rcx, -32(%rsp) ; FAST-SHLD-NEXT: orq $-2, -56(%rsp) @@ -23,7 +23,7 @@ define void @_start() nounwind { ; SLOW-SHLD: # %bb.0: # %Entry ; SLOW-SHLD-NEXT: movq -40(%rsp), %rax ; SLOW-SHLD-NEXT: andq $-4, %rax -; SLOW-SHLD-NEXT: orq $1, %rax +; SLOW-SHLD-NEXT: incq %rax ; SLOW-SHLD-NEXT: movq %rax, -40(%rsp) ; SLOW-SHLD-NEXT: orq $-2, -56(%rsp) ; SLOW-SHLD-NEXT: movq $-1, -48(%rsp) diff --git a/llvm/test/CodeGen/X86/pr23664.ll b/llvm/test/CodeGen/X86/pr23664.ll index 453e5db2bed6..8179602b8c2a 100644 --- a/llvm/test/CodeGen/X86/pr23664.ll +++ b/llvm/test/CodeGen/X86/pr23664.ll @@ -6,7 +6,7 @@ define i2 @f(i32 %arg) { ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: leal (%rdi,%rdi), %eax -; CHECK-NEXT: orb $1, %al +; CHECK-NEXT: incb %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq %trunc = trunc i32 %arg to i1 -- cgit v1.2.3 From 1b761205f2686516cebadbcbc37f798197d9c482 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 4 Apr 2024 16:11:06 +0100 Subject: [APInt] Add a simpler overload of multiplicativeInverse (#87610) The current APInt::multiplicativeInverse takes a modulus which can be any value, but all in-tree callers use a power of two. 
Moreover, most callers want to use two to the power of the width of an existing APInt, which is awkward because 2^N is not representable as an N-bit APInt. Add a new overload of multiplicativeInverse which implicitly uses 2^BitWidth as the modulus. --- llvm/include/llvm/ADT/APInt.h | 3 +++ llvm/lib/Analysis/ScalarEvolution.cpp | 11 +++-------- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 5 +---- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 23 +++++------------------ llvm/lib/Support/APInt.cpp | 13 +++++++++++++ llvm/unittests/ADT/APIntTest.cpp | 3 ++- 6 files changed, 27 insertions(+), 31 deletions(-) diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index b9b39f3b9dfb..bd1716219ee5 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -1743,6 +1743,9 @@ public: /// \returns the multiplicative inverse for a given modulo. APInt multiplicativeInverse(const APInt &modulo) const; + /// \returns the multiplicative inverse of an odd APInt modulo 2^BitWidth. + APInt multiplicativeInverse() const; + /// @} /// \name Building-block Operations for APInt and APFloat /// @{ diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 515b9d0744f6..e030b9fc7dac 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -944,10 +944,7 @@ static const SCEV *BinomialCoefficient(const SCEV *It, unsigned K, // Calculate the multiplicative inverse of K! / 2^T; // this multiplication factor will perform the exact division by // K! / 2^T. 
- APInt Mod = APInt::getSignedMinValue(W+1); - APInt MultiplyFactor = OddFactorial.zext(W+1); - MultiplyFactor = MultiplyFactor.multiplicativeInverse(Mod); - MultiplyFactor = MultiplyFactor.trunc(W); + APInt MultiplyFactor = OddFactorial.multiplicativeInverse(); // Calculate the product, at width T+W IntegerType *CalculationTy = IntegerType::get(SE.getContext(), @@ -10086,10 +10083,8 @@ static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const SCEV *B, // If D == 1, (N / D) == N == 2^BW, so we need one extra bit to represent // (N / D) in general. The inverse itself always fits into BW bits, though, // so we immediately truncate it. - APInt AD = A.lshr(Mult2).zext(BW + 1); // AD = A / D - APInt Mod(BW + 1, 0); - Mod.setBit(BW - Mult2); // Mod = N / D - APInt I = AD.multiplicativeInverse(Mod).trunc(BW); + APInt AD = A.lshr(Mult2).trunc(BW - Mult2); // AD = A / D + APInt I = AD.multiplicativeInverse().zext(BW); // 4. Compute the minimum unsigned root of the equation: // I * (B / D) mod (N / D) diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 062132c8304b..719209e0edd5 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -5201,10 +5201,7 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) { // Calculate the multiplicative inverse modulo BW. // 2^W requires W + 1 bits, so we have to extend and then truncate. 
- unsigned W = Divisor.getBitWidth(); - APInt Factor = Divisor.zext(W + 1) - .multiplicativeInverse(APInt::getSignedMinValue(W + 1)) - .trunc(W); + APInt Factor = Divisor.multiplicativeInverse(); Shifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0)); Factors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0)); return true; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 5e053f97675d..409d66adfd67 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6071,11 +6071,7 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N, Divisor.ashrInPlace(Shift); UseSRA = true; } - // Calculate the multiplicative inverse, using Newton's method. - APInt t; - APInt Factor = Divisor; - while ((t = Divisor * Factor) != 1) - Factor *= APInt(Divisor.getBitWidth(), 2) - t; + APInt Factor = Divisor.multiplicativeInverse(); Shifts.push_back(DAG.getConstant(Shift, dl, ShSVT)); Factors.push_back(DAG.getConstant(Factor, dl, SVT)); return true; @@ -6664,10 +6660,7 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode, // P = inv(D0, 2^W) // 2^W requires W + 1 bits, so we have to extend and then truncate. unsigned W = D.getBitWidth(); - APInt P = D0.zext(W + 1) - .multiplicativeInverse(APInt::getSignedMinValue(W + 1)) - .trunc(W); - assert(!P.isZero() && "No multiplicative inverse!"); // unreachable + APInt P = D0.multiplicativeInverse(); assert((D0 * P).isOne() && "Multiplicative inverse basic check failed."); // Q = floor((2^W - 1) u/ D) @@ -6922,10 +6915,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode, // P = inv(D0, 2^W) // 2^W requires W + 1 bits, so we have to extend and then truncate. 
unsigned W = D.getBitWidth(); - APInt P = D0.zext(W + 1) - .multiplicativeInverse(APInt::getSignedMinValue(W + 1)) - .trunc(W); - assert(!P.isZero() && "No multiplicative inverse!"); // unreachable + APInt P = D0.multiplicativeInverse(); assert((D0 * P).isOne() && "Multiplicative inverse basic check failed."); // A = floor((2^(W - 1) - 1) / D0) & -2^K @@ -7651,7 +7641,7 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT, // // For division, we can compute the remainder using the algorithm described // above, subtract it from the dividend to get an exact multiple of Constant. -// Then multiply that extact multiply by the multiplicative inverse modulo +// Then multiply that exact multiply by the multiplicative inverse modulo // (1 << (BitWidth / 2)) to get the quotient. // If Constant is even, we can shift right the dividend and the divisor by the @@ -7786,10 +7776,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N, // Multiply by the multiplicative inverse of the divisor modulo // (1 << BitWidth). - APInt Mod = APInt::getSignedMinValue(BitWidth + 1); - APInt MulFactor = Divisor.zext(BitWidth + 1); - MulFactor = MulFactor.multiplicativeInverse(Mod); - MulFactor = MulFactor.trunc(BitWidth); + APInt MulFactor = Divisor.multiplicativeInverse(); SDValue Quotient = DAG.getNode(ISD::MUL, dl, VT, Dividend, DAG.getConstant(MulFactor, dl, VT)); diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp index c20609748dc9..f8f699f8f6cc 100644 --- a/llvm/lib/Support/APInt.cpp +++ b/llvm/lib/Support/APInt.cpp @@ -1289,6 +1289,19 @@ APInt APInt::multiplicativeInverse(const APInt& modulo) const { return std::move(t[i]); } +/// \returns the multiplicative inverse of an odd APInt modulo 2^BitWidth. +APInt APInt::multiplicativeInverse() const { + assert((*this)[0] && + "multiplicative inverse is only defined for odd numbers!"); + + // Use Newton's method. 
+ APInt Factor = *this; + APInt T; + while (!(T = *this * Factor).isOne()) + Factor *= 2 - T; + return Factor; +} + /// Implementation of Knuth's Algorithm D (Division of nonnegative integers) /// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The /// variables here have the same names as in the algorithm. Comments explain diff --git a/llvm/unittests/ADT/APIntTest.cpp b/llvm/unittests/ADT/APIntTest.cpp index d5ef63e38e27..23f9ee2d39c4 100644 --- a/llvm/unittests/ADT/APIntTest.cpp +++ b/llvm/unittests/ADT/APIntTest.cpp @@ -3257,9 +3257,10 @@ TEST(APIntTest, MultiplicativeInverseExaustive) { .multiplicativeInverse(APInt::getSignedMinValue(BitWidth + 1)) .trunc(BitWidth); APInt One = V * MulInv; - if (!V.isZero() && V.countr_zero() == 0) { + if (V[0]) { // Multiplicative inverse exists for all odd numbers. EXPECT_TRUE(One.isOne()); + EXPECT_TRUE((V * V.multiplicativeInverse()).isOne()); } else { // Multiplicative inverse does not exist for even numbers (and 0). EXPECT_TRUE(MulInv.isZero()); -- cgit v1.2.3 From eb3819073ea7aa6ee06b698da5e8f8a9d08dff75 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Thu, 4 Apr 2024 17:25:44 +0200 Subject: [libc++][oss-fuzz] Updates C++ version used. (#87531) This version update allows testing of `std::format` and the chrono timezone parsing in oss-fuzz. 
--- libcxx/utils/ci/oss-fuzz.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/utils/ci/oss-fuzz.sh b/libcxx/utils/ci/oss-fuzz.sh index e5723406a9ff..03b59b294041 100755 --- a/libcxx/utils/ci/oss-fuzz.sh +++ b/libcxx/utils/ci/oss-fuzz.sh @@ -23,7 +23,7 @@ for test in libcxx/test/libcxx/fuzzing/*.pass.cpp; do exe="$(basename ${test})" exe="${exe%.pass.cpp}" ${CXX} ${CXXFLAGS} \ - -std=c++14 \ + -std=c++20 \ -DLIBCPP_OSS_FUZZ \ -D_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS \ -nostdinc++ -cxx-isystem ${INSTALL}/include/c++/v1 \ -- cgit v1.2.3 From 5fd9babbfcd02bae431d5b280da59adddc2824d3 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Apr 2024 16:54:21 +0100 Subject: [X86] Rename Zn3FPP# ports -> Zn3FP#. NFC Matches Zn4FP# (which is mostly a copy) and avoids an issue in llvm-exegesis which is terrible at choosing the right portname when they have aliases. --- llvm/lib/Target/X86/X86ScheduleZnver3.td | 86 ++++---- .../llvm-mca/X86/Znver3/dependency-breaking-gpr.s | 72 +++---- .../llvm-mca/X86/Znver3/mulx-hi-read-advance.s | 24 +-- .../tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s | 24 +-- .../tools/llvm-mca/X86/Znver3/mulx-read-advance.s | 24 +-- .../tools/llvm-mca/X86/Znver3/mulx-same-regs.s | 24 +-- .../tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s | 48 ++--- .../tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s | 48 ++--- .../tools/llvm-mca/X86/Znver3/one-idioms-mmx.s | 36 ++-- .../tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s | 48 ++--- .../llvm-mca/X86/Znver3/partial-reg-update-3.s | 12 +- .../llvm-mca/X86/Znver3/partial-reg-update-4.s | 12 +- .../llvm-mca/X86/Znver3/partial-reg-update-5.s | 12 +- .../llvm-mca/X86/Znver3/partial-reg-update-6.s | 12 +- .../X86/Znver3/reg-move-elimination-avx-xmm.s | 72 +++---- .../X86/Znver3/reg-move-elimination-avx-ymm.s | 72 +++---- .../llvm-mca/X86/Znver3/reg-move-elimination-gpr.s | 48 ++--- .../llvm-mca/X86/Znver3/reg-move-elimination-mmx.s | 12 +- 
.../X86/Znver3/reg-move-elimination-sse-xmm.s | 72 +++---- .../llvm-mca/X86/Znver3/reg-move-elimination-x87.s | 12 +- .../test/tools/llvm-mca/X86/Znver3/resources-adx.s | 12 +- .../test/tools/llvm-mca/X86/Znver3/resources-aes.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-avx1.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-avx2.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-bmi1.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-bmi2.s | 12 +- .../llvm-mca/X86/Znver3/resources-clflushopt.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-clzero.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-cmov.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-cmpxchg.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-f16c.s | 12 +- .../test/tools/llvm-mca/X86/Znver3/resources-fma.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-fsgsbase.s | 12 +- .../test/tools/llvm-mca/X86/Znver3/resources-lea.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-lzcnt.s | 12 +- .../test/tools/llvm-mca/X86/Znver3/resources-mmx.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-movbe.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-mwaitx.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-pclmul.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-popcnt.s | 12 +- .../llvm-mca/X86/Znver3/resources-prefetchw.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-rdrand.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-rdseed.s | 12 +- .../test/tools/llvm-mca/X86/Znver3/resources-sha.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-sse1.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-sse2.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-sse3.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-sse41.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-sse42.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-sse4a.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-ssse3.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-vaes.s | 12 +- .../llvm-mca/X86/Znver3/resources-vpclmulqdq.s | 12 +- 
.../tools/llvm-mca/X86/Znver3/resources-x86_32.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-x86_64.s | 12 +- .../test/tools/llvm-mca/X86/Znver3/resources-x87.s | 12 +- .../tools/llvm-mca/X86/Znver3/resources-xsave.s | 12 +- .../llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s | 216 +++++++++---------- .../llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s | 240 ++++++++++----------- .../tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s | 48 ++--- .../llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s | 216 +++++++++---------- 61 files changed, 967 insertions(+), 967 deletions(-) diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td index d90c8bd284eb..2e87d5262818 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver3.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td @@ -52,7 +52,7 @@ def Znver3Model : SchedMachineModel { int VecLoadLatency = 7; // Latency of a simple store operation. int StoreLatency = 1; - // FIXME + // FIXME: let HighLatency = 25; // FIXME: any better choice? // AMD SOG 19h, 2.8 Optimizing Branching // The branch misprediction penalty is in the range from 11 to 18 cycles, @@ -193,11 +193,11 @@ def Zn3Int : ProcResGroup<[Zn3ALU0, Zn3AGU0, Zn3BRU0, // scheduler 0 // <...>, and six FPU pipes. 
// Agner, 22.10 Floating point execution pipes // There are six floating point/vector execution pipes, -def Zn3FPP0 : ProcResource<1>; -def Zn3FPP1 : ProcResource<1>; -def Zn3FPP2 : ProcResource<1>; -def Zn3FPP3 : ProcResource<1>; -def Zn3FPP45 : ProcResource<2>; +def Zn3FP0 : ProcResource<1>; +def Zn3FP1 : ProcResource<1>; +def Zn3FP2 : ProcResource<1>; +def Zn3FP3 : ProcResource<1>; +def Zn3FP45 : ProcResource<2>; // // Execution Units @@ -205,63 +205,63 @@ def Zn3FPP45 : ProcResource<2>; // AMD SOG 19h, 2.11.1 Floating Point Execution Resources // (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ) -defvar Zn3FPFMul0 = Zn3FPP0; -defvar Zn3FPFMul1 = Zn3FPP1; +defvar Zn3FPFMul0 = Zn3FP0; +defvar Zn3FPFMul1 = Zn3FP1; // (v)FADD* -defvar Zn3FPFAdd0 = Zn3FPP2; -defvar Zn3FPFAdd1 = Zn3FPP3; +defvar Zn3FPFAdd0 = Zn3FP2; +defvar Zn3FPFAdd1 = Zn3FP3; // All convert operations except pack/unpack -defvar Zn3FPFCvt0 = Zn3FPP2; -defvar Zn3FPFCvt1 = Zn3FPP3; +defvar Zn3FPFCvt0 = Zn3FP2; +defvar Zn3FPFCvt1 = Zn3FP3; // All Divide and Square Root except Reciprocal Approximation // AMD SOG 19h, 2.11.1 Floating Point Execution Resources // FDIV unit can support 2 simultaneous operations in flight // even though it occupies a single pipe. // FIXME: BufferSize=2 ? -defvar Zn3FPFDiv = Zn3FPP1; +defvar Zn3FPFDiv = Zn3FP1; // Moves and Logical operations on Floating Point Data Types -defvar Zn3FPFMisc0 = Zn3FPP0; -defvar Zn3FPFMisc1 = Zn3FPP1; -defvar Zn3FPFMisc2 = Zn3FPP2; -defvar Zn3FPFMisc3 = Zn3FPP3; +defvar Zn3FPFMisc0 = Zn3FP0; +defvar Zn3FPFMisc1 = Zn3FP1; +defvar Zn3FPFMisc2 = Zn3FP2; +defvar Zn3FPFMisc3 = Zn3FP3; // Integer Adds, Subtracts, and Compares // Some complex VADD operations are not available in all pipes. 
-defvar Zn3FPVAdd0 = Zn3FPP0; -defvar Zn3FPVAdd1 = Zn3FPP1; -defvar Zn3FPVAdd2 = Zn3FPP2; -defvar Zn3FPVAdd3 = Zn3FPP3; +defvar Zn3FPVAdd0 = Zn3FP0; +defvar Zn3FPVAdd1 = Zn3FP1; +defvar Zn3FPVAdd2 = Zn3FP2; +defvar Zn3FPVAdd3 = Zn3FP3; // Integer Multiplies, SAD, Blendvb -defvar Zn3FPVMul0 = Zn3FPP0; -defvar Zn3FPVMul1 = Zn3FPP3; +defvar Zn3FPVMul0 = Zn3FP0; +defvar Zn3FPVMul1 = Zn3FP3; // Data Shuffles, Packs, Unpacks, Permute // Some complex shuffle operations are only available in pipe1. -defvar Zn3FPVShuf = Zn3FPP1; -defvar Zn3FPVShufAux = Zn3FPP2; +defvar Zn3FPVShuf = Zn3FP1; +defvar Zn3FPVShufAux = Zn3FP2; // Bit Shift Left/Right operations -defvar Zn3FPVShift0 = Zn3FPP1; -defvar Zn3FPVShift1 = Zn3FPP2; +defvar Zn3FPVShift0 = Zn3FP1; +defvar Zn3FPVShift1 = Zn3FP2; // Moves and Logical operations on Packed Integer Data Types -defvar Zn3FPVMisc0 = Zn3FPP0; -defvar Zn3FPVMisc1 = Zn3FPP1; -defvar Zn3FPVMisc2 = Zn3FPP2; -defvar Zn3FPVMisc3 = Zn3FPP3; +defvar Zn3FPVMisc0 = Zn3FP0; +defvar Zn3FPVMisc1 = Zn3FP1; +defvar Zn3FPVMisc2 = Zn3FP2; +defvar Zn3FPVMisc3 = Zn3FP3; // *AES* -defvar Zn3FPAES0 = Zn3FPP0; -defvar Zn3FPAES1 = Zn3FPP1; +defvar Zn3FPAES0 = Zn3FP0; +defvar Zn3FPAES1 = Zn3FP1; // *CLM* -defvar Zn3FPCLM0 = Zn3FPP0; -defvar Zn3FPCLM1 = Zn3FPP1; +defvar Zn3FPCLM0 = Zn3FP0; +defvar Zn3FPCLM1 = Zn3FP1; // Execution pipeline grouping //===----------------------------------------------------------------------===// @@ -269,7 +269,7 @@ defvar Zn3FPCLM1 = Zn3FPP1; // AMD SOG 19h, 2.11 Floating-Point Unit // Stores and floating point to general purpose register transfer // have 2 dedicated pipelines (pipe 5 and 6). 
-def Zn3FPU0123 : ProcResGroup<[Zn3FPP0, Zn3FPP1, Zn3FPP2, Zn3FPP3]>; +def Zn3FPU0123 : ProcResGroup<[Zn3FP0, Zn3FP1, Zn3FP2, Zn3FP3]>; // (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ) def Zn3FPFMul01 : ProcResGroup<[Zn3FPFMul0, Zn3FPFMul1]>; @@ -293,12 +293,12 @@ def Zn3FPFMisc12 : ProcResGroup<[Zn3FPFMisc1, Zn3FPFMisc2]>; // AMD SOG 19h, 2.11 Floating-Point Unit // Stores and floating point to general purpose register transfer // have 2 dedicated pipelines (pipe 5 and 6). -defvar Zn3FPLd01 = Zn3FPP45; +defvar Zn3FPLd01 = Zn3FP45; // AMD SOG 19h, 2.11 Floating-Point Unit // Note that FP stores are supported on two pipelines, // but throughput is limited to one per cycle. -let Super = Zn3FPP45 in +let Super = Zn3FP45 in def Zn3FPSt : ProcResource<1>; // Integer Adds, Subtracts, and Compares @@ -345,8 +345,8 @@ def Zn3FpPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 1], [0, 1, 1], // AMD SOG 19h, 2.11 Floating-Point Unit // <...> the scheduler can issue 1 micro op per cycle for each pipe. // FIXME: those are two separate schedulers, not a single big one. -def Zn3FP : ProcResGroup<[Zn3FPP0, Zn3FPP2, /*Zn3FPP4,*/ // scheduler 0 - Zn3FPP1, Zn3FPP3, Zn3FPP45 /*Zn3FPP5*/ // scheduler 1 +def Zn3FP : ProcResGroup<[Zn3FP0, Zn3FP2, /*Zn3FP4,*/ // scheduler 0 + Zn3FP1, Zn3FP3, Zn3FP45 /*Zn3FP5*/ // scheduler 1 ]> { let BufferSize = !mul(2, 32); } @@ -838,9 +838,9 @@ defm : Zn3WriteResInt; defm : Zn3WriteResIntPair; // FIXME: not from llvm-exegesis // Floating point. This covers both scalar and vector operations. 
-defm : Zn3WriteResInt; -defm : Zn3WriteResInt; -defm : Zn3WriteResInt; +defm : Zn3WriteResInt; +defm : Zn3WriteResInt; +defm : Zn3WriteResInt; defm : Zn3WriteResXMM; defm : Zn3WriteResXMM; defm : Zn3WriteResYMM; diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s b/llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s index 4654ce10dffd..349abec66457 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s @@ -68,12 +68,12 @@ cmovael %eax, %ecx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -145,12 +145,12 @@ cmovael %eax, %ecx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -223,12 +223,12 @@ cmovael %eax, %ecx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# 
CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -306,12 +306,12 @@ cmovael %eax, %ecx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -389,12 +389,12 @@ cmovael %eax, %ecx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -472,12 +472,12 @@ cmovael %eax, %ecx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff 
--git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s index 12d6f399d429..0fcd6f507432 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s @@ -46,12 +46,12 @@ add %rax, %rax # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -122,12 +122,12 @@ add %rax, %rax # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s index 93f8d7616cb9..cd427bb5912e 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s @@ -41,12 +41,12 @@ mulxq %rax, %rax, %rcx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - 
Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -112,12 +112,12 @@ mulxq %rax, %rax, %rcx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s index 13ef5bcb11ca..bf82486cf737 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s @@ -43,12 +43,12 @@ mulxq (%rdi), %rax, %rdx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -115,12 +115,12 @@ mulxq (%rdi), %rax, %rdx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] 
- Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s index bfe8be85086f..8a5a0148cf58 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s @@ -44,12 +44,12 @@ mulxq %rax, %rax, %rax # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -116,12 +116,12 @@ mulxq %rax, %rax, %rax # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s index 1431875ade93..f0e16a8d8b99 100644 --- 
a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s @@ -68,12 +68,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -159,12 +159,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -250,12 +250,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -341,12 +341,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - 
Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s index eb2bb979b7f5..97f6a34acac1 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s @@ -68,12 +68,12 @@ vpaddq %ymm0, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -159,12 +159,12 @@ vpaddq %ymm0, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -250,12 +250,12 @@ vpaddq %ymm0, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # 
CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -341,12 +341,12 @@ vpaddq %ymm0, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s index 5909af853dd5..c733f639819b 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s @@ -63,12 +63,12 @@ paddd %mm0, %mm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -154,12 +154,12 @@ paddd %mm0, %mm0 # CHECK-NEXT: [5] - 
Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -245,12 +245,12 @@ paddd %mm0, %mm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s index 5a05487f217a..63df99e56f8c 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s @@ -68,12 +68,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -159,12 +159,12 @@ paddq %xmm0, %xmm0 
# CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -250,12 +250,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -341,12 +341,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s index 7ac674c5a6b5..66c1322fa790 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s @@ -40,12 
+40,12 @@ xor %bx, %dx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s index 582da14211d0..4ed529ee1446 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s @@ -40,12 +40,12 @@ add %cx, %bx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s index dda87e9ebc92..58941116d31f 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s @@ -33,12 +33,12 @@ lzcnt %ax, %bx ## partial register stall. 
# CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s index 71520ea1ce4b..fdbf4d99fbf6 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s @@ -42,12 +42,12 @@ lzcnt 2(%rsp), %cx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s index 7afa80cb0c9a..f3e515ce7e21 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s @@ -180,12 +180,12 @@ vmovdqu %xmm15, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# 
CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -474,12 +474,12 @@ vmovdqu %xmm15, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -768,12 +768,12 @@ vmovdqu %xmm15, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1062,12 +1062,12 @@ vmovdqu %xmm15, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] 
- Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1356,12 +1356,12 @@ vmovdqu %xmm15, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1650,12 +1650,12 @@ vmovdqu %xmm15, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s index 8b81d55ca5db..a484a7562fdc 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s @@ -180,12 +180,12 @@ vmovdqu %ymm15, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# 
CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -474,12 +474,12 @@ vmovdqu %ymm15, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -768,12 +768,12 @@ vmovdqu %ymm15, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1062,12 +1062,12 @@ vmovdqu %ymm15, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1356,12 +1356,12 @@ vmovdqu %ymm15, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # 
CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1650,12 +1650,12 @@ vmovdqu %ymm15, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s index f359048e5298..eb20d13e8c94 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s @@ -134,12 +134,12 @@ xchgq %r15, %rax # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -402,12 +402,12 @@ xchgq %r15, %rax # 
CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -670,12 +670,12 @@ xchgq %r15, %rax # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -938,12 +938,12 @@ xchgq %r15, %rax # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s index b556fd61f077..e17d6717e136 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s 
@@ -61,12 +61,12 @@ movq %mm7, %mm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s index 147cb0fca285..b45fd172bd55 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s @@ -180,12 +180,12 @@ movdqu %xmm15, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -474,12 +474,12 @@ movdqu %xmm15, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - 
Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -768,12 +768,12 @@ movdqu %xmm15, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1062,12 +1062,12 @@ movdqu %xmm15, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1356,12 +1356,12 @@ movdqu %xmm15, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1650,12 +1650,12 @@ movdqu %xmm15, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# 
CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s index de59edf2352e..0465d413a854 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s @@ -67,12 +67,12 @@ fxch %st(0) # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s index 4e024e5846f1..9c5a19b1a0a2 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s @@ -38,12 +38,12 @@ adox (%rbx), %rcx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - 
Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s index 5abf3cca9211..d1086961682d 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s @@ -50,12 +50,12 @@ aeskeygenassist $22, (%rax), %xmm2 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s index 146b3ce62964..4f0b4843d170 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s @@ -1731,12 +1731,12 @@ vzeroupper # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s index 3c6b31a1ca01..1a8b9e2de1d8 
100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s @@ -771,12 +771,12 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s index 8c0e84135750..26002373e9f5 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s @@ -85,12 +85,12 @@ tzcnt (%rax), %rcx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s index 8d00c99982b0..0664c1dc7eed 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s @@ -100,12 +100,12 @@ shrx %rax, (%rbx), %rcx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 
-# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s index 3e7219c9c6d0..b40d155e13f5 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s @@ -23,12 +23,12 @@ clflushopt (%rax) # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s index 0dc89faad77c..0f9935c5ba14 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s @@ -23,12 +23,12 @@ clzero # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# 
CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s index e0e46afd0c0b..8118e40ca5be 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s @@ -218,12 +218,12 @@ cmovgq (%rax), %rdi # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s index 03763e5a2bfe..9ab877636b29 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s @@ -25,12 +25,12 @@ cmpxchg16b (%rax) # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s index 
bb995d588c43..345ae02b3002 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s @@ -40,12 +40,12 @@ vcvtps2ph $0, %ymm0, (%rax) # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s index 9af180d8cb5c..af207f029c82 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s @@ -500,12 +500,12 @@ vfnmsub231ss (%rax), %xmm1, %xmm2 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s index 142508c4477e..3e651837c2bd 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s @@ -40,12 +40,12 @@ wrgsbase %rdi # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: 
[7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s index 1545a228d9c5..02572026003c 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s @@ -293,12 +293,12 @@ lea 1024(%rax, %rbx, 2), %rcx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s index ffbe414bac6c..735287a6f564 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s @@ -35,12 +35,12 @@ lzcntq (%rax), %rcx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - 
Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s index 75dbf95f4caa..2bc6177c8031 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s @@ -279,12 +279,12 @@ pxor (%rax), %mm2 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s index 144e97fbaf58..6eeabbd2d7d4 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s @@ -35,12 +35,12 @@ movbe (%rax), %rcx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s 
b/llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s index 3b343d733425..103fd3ebf239 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s @@ -25,12 +25,12 @@ mwaitx # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s index 2d9f0e9b14e2..893f47612f2f 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s @@ -25,12 +25,12 @@ pclmulqdq $11, (%rax), %xmm2 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s index cce078f2a8d8..29bcc5c62b4c 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s @@ -35,12 +35,12 @@ popcntq (%rax), %rcx # CHECK-NEXT: [5] - 
Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s index 5423b6bcff1f..b80e8f7f8b02 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s @@ -25,12 +25,12 @@ prefetchw (%rax) # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s index fb0925364449..649eb10c22af 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s @@ -27,12 +27,12 @@ rdrand %rax # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - 
Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s index f10a90ff9bc4..44e0eebe4a2a 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s @@ -27,12 +27,12 @@ rdseed %rax # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s index 360a667edfaa..e6d5ab90a2ac 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s @@ -55,12 +55,12 @@ sha256rnds2 (%rax), %xmm2 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git 
a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s index 9816b87df462..4c7a3f06d762 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s @@ -328,12 +328,12 @@ xorps (%rax), %xmm2 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s index f69c535385af..d24aebfdf675 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s @@ -684,12 +684,12 @@ xorpd (%rax), %xmm2 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s index 8110390219c7..51bb95feb091 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s @@ -74,12 +74,12 @@ 
mwait # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s index 0cc6c6a5c25e..e952a169db8a 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s @@ -261,12 +261,12 @@ roundss $1, (%rax), %xmm2 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s index 873e4f406426..8afcd809b67c 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s @@ -70,12 +70,12 @@ pcmpgtq (%rax), %xmm2 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 
+# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s index 1c1b0b24222f..6606a3ef2b2b 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s @@ -35,12 +35,12 @@ movntss %xmm0, (%rax) # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s index aeec49351a11..66688700404e 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s @@ -180,12 +180,12 @@ psignw (%rax), %xmm2 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git 
a/llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s index 076094ffe195..81afc7df85ef 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s @@ -40,12 +40,12 @@ vaesenclast (%rax), %ymm1, %ymm3 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s index 31680d562a42..10440e904b01 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s @@ -25,12 +25,12 @@ vpclmulqdq $11, (%rax), %ymm1, %ymm3 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s index fb09b650840e..8f627ca18771 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s +++ 
b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s @@ -56,12 +56,12 @@ salc # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s index fedb3d242c45..41ec631dc3fb 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s @@ -1957,12 +1957,12 @@ xorq (%rax), %rdi # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s index 9a92bd0f2e7d..cd8a06a57f78 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s @@ -364,12 +364,12 @@ fyl2xp1 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# 
CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s index 819361c0a025..f348ff8696f6 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s @@ -35,12 +35,12 @@ xsetbv # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s index 33657e6b66b8..ed4e8f90dd7b 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s @@ -138,12 +138,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # 
CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -229,12 +229,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -320,12 +320,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -411,12 +411,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -502,12 +502,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - 
Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -593,12 +593,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -684,12 +684,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -775,12 +775,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 
+# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -866,12 +866,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -957,12 +957,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1048,12 +1048,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1139,12 +1139,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: 
[6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1230,12 +1230,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1321,12 +1321,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1412,12 +1412,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: 
[8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1503,12 +1503,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1594,12 +1594,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1685,12 +1685,12 @@ vpaddq %xmm0, %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: 
[14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s index ba7f51eb245a..24043369c6f1 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s @@ -148,12 +148,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -239,12 +239,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -330,12 +330,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - 
Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -421,12 +421,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -512,12 +512,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -603,12 +603,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -694,12 +694,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - 
Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -785,12 +785,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -876,12 +876,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -967,12 +967,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# 
CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1058,12 +1058,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1149,12 +1149,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1240,12 +1240,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1331,12 +1331,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: 
[6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1422,12 +1422,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1513,12 +1513,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1604,12 +1604,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] 
- Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1695,12 +1695,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1786,12 +1786,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1878,12 +1878,12 @@ vpxor %ymm1, %ymm0, %ymm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - 
Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s index 018adc261b08..4d648f7fb9de 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s @@ -68,12 +68,12 @@ addq %rax, %rax # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -175,12 +175,12 @@ addq %rax, %rax # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -282,12 +282,12 @@ addq %rax, %rax # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - 
Zn3LSU @@ -389,12 +389,12 @@ addq %rax, %rax # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s index 935881a91421..aca39c52f36e 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s @@ -138,12 +138,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -229,12 +229,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - 
Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -320,12 +320,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -411,12 +411,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -502,12 +502,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -593,12 +593,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - 
Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -684,12 +684,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -775,12 +775,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -866,12 +866,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] 
- Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -957,12 +957,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1048,12 +1048,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1139,12 +1139,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1230,12 +1230,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# 
CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1321,12 +1321,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1412,12 +1412,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1503,12 +1503,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - 
Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1594,12 +1594,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU @@ -1685,12 +1685,12 @@ paddq %xmm0, %xmm0 # CHECK-NEXT: [5] - Zn3ALU2 # CHECK-NEXT: [6] - Zn3ALU3 # CHECK-NEXT: [7] - Zn3BRU1 -# CHECK-NEXT: [8] - Zn3FPP0 -# CHECK-NEXT: [9] - Zn3FPP1 -# CHECK-NEXT: [10] - Zn3FPP2 -# CHECK-NEXT: [11] - Zn3FPP3 -# CHECK-NEXT: [12.0] - Zn3FPP45 -# CHECK-NEXT: [12.1] - Zn3FPP45 +# CHECK-NEXT: [8] - Zn3FP0 +# CHECK-NEXT: [9] - Zn3FP1 +# CHECK-NEXT: [10] - Zn3FP2 +# CHECK-NEXT: [11] - Zn3FP3 +# CHECK-NEXT: [12.0] - Zn3FP45 +# CHECK-NEXT: [12.1] - Zn3FP45 # CHECK-NEXT: [13] - Zn3FPSt # CHECK-NEXT: [14.0] - Zn3LSU # CHECK-NEXT: [14.1] - Zn3LSU -- cgit v1.2.3 From d3fe2b538d53373f03ae096ccc05d07cb4d3ff3a Mon Sep 17 00:00:00 2001 From: Andrzej Warzynski Date: Thu, 4 Apr 2024 17:12:37 +0100 Subject: Revert "[mlir][test] Make SME e2e tests require an emulator (#86489)" This reverts commit 7b5255297dca377a37c8df066e9d9749ab96cfad. 
Broken bot: * https://lab.llvm.org/buildbot/#/builders/179/builds/9794 --- mlir/cmake/modules/MLIRCheckHardwareFeatures.cmake | 101 --------------------- mlir/docs/Dialects/ArmSME.md | 10 +- mlir/test/CMakeLists.txt | 6 -- 3 files changed, 1 insertion(+), 116 deletions(-) delete mode 100644 mlir/cmake/modules/MLIRCheckHardwareFeatures.cmake diff --git a/mlir/cmake/modules/MLIRCheckHardwareFeatures.cmake b/mlir/cmake/modules/MLIRCheckHardwareFeatures.cmake deleted file mode 100644 index fff0424593fb..000000000000 --- a/mlir/cmake/modules/MLIRCheckHardwareFeatures.cmake +++ /dev/null @@ -1,101 +0,0 @@ -# A collection of helper CMake functions to detect hardware capabilities. At -# the moment these are used when configuring MLIR integration tests. - -# Checks whether the specified hardware capability is supported by the host -# Linux system. This is implemented by checking auxiliary vector feature -# provided by the Linux kernel. -# -# check_hwcap( -# hwcap_spec -# output_var -# ) -# -# hwcap_spec - HWCAP value to check - these are defined in hwcap.h in the Linux -# kernel. -# -# output_var - Output variable to use to save the results (TRUE for supported, -# FALSE for not supported). 
-# -# EXAMPLES: -# -# check_hwcap("HWCAP2_SME" SME_EMULATOR_REQUIRED) -# -function(check_hwcap hwcap_spec output) - set(hwcap_test_src - [====[ - #include - #include - - int main(void) - { - long hwcaps = getauxval(AT_); - return (hwcaps & ) != 0; - } - ]====] - ) - - # Extract from $hwcap_spec whether this is AT_HWCAP or AT_HWCAP2 - string(FIND ${hwcap_spec} "_" wsloc) - string(SUBSTRING ${hwcap_spec} 0 ${wsloc} hwcap_vec) - - string(REPLACE "" ${hwcap_vec} hwcap_test_src "${hwcap_test_src}") - string(REPLACE "" ${hwcap_spec} hwcap_test_src "${hwcap_test_src}") - - set(hwcap_test_file ${CMAKE_BINARY_DIR}/${CMAKE_FILES_DIRECTORY}/hwcap_check.c) - file(WRITE ${hwcap_test_file} "${hwcap_test_src}") - - # Compile _and_ run - try_run( - test_run_result test_compile_result - ${CMAKE_BINARY_DIR} - ${hwcap_test_file} - ) - # Compilation will fail if hwcap_spec is not defined - this usually means - # that your Linux kernel is too old. - if(${test_compile_result} AND (DEFINED test_run_result)) - message(${test_run_result}) - message(STATUS "Checking whether ${hwcap_spec} is supported by the host system: ${test_run_result}") - set(${output} ${test_run_result} PARENT_SCOPE) - else() - message(STATUS "Checking whether ${hwcap_spec} is supported by the host system: FALSE") - endif() -endfunction(check_hwcap) - -# For the given group of e2e tests (defined by the `mlir_e2e_tests` flag), -# checks whether an emulator is required. If yes, verifies that the -# corresponding CMake var pointing to an emulator (`emulator_exec`) has been -# set. -# -# check_emulator( -# mlir_e2e_tests -# hwcap_spec -# emulator_exec -# ) -# -# mlir_e2e_tests - MLIR CMake variables corresponding to the group of e2e tests -# to check -# hwcap_spec - HWCAP value to check. This should correspond to the hardware -# capabilities required by the tests to be checked. Possible -# values are defined in hwcap.h in the Linux kernel. 
-# emulator_exec - variable the defines the emulator (ought to be set if -# required, can be empty otherwise). -# -# EXAMPLES: -# -# check_emulator(MLIR_RUN_ARM_SVE_TESTS "HWCAP_SVE" ARM_EMULATOR_EXECUTABLE) -# -function(check_emulator mlir_e2e_tests hwcap_spec emulator_exec) - if (NOT ${mlir_e2e_tests}) - return() - endif() - - check_hwcap(${hwcap_spec} emulator_not_required) - if (${emulator_not_required}) - return() - endif() - - if (${emulator_exec} STREQUAL "") - message(FATAL_ERROR "${mlir_e2e_tests} requires an emulator, but ${emulator_exec} is not set") - endif() - -endfunction() diff --git a/mlir/docs/Dialects/ArmSME.md b/mlir/docs/Dialects/ArmSME.md index ce0a76ed60f1..7326150bcd11 100644 --- a/mlir/docs/Dialects/ArmSME.md +++ b/mlir/docs/Dialects/ArmSME.md @@ -6,7 +6,7 @@ This dialect defines custom and LLVM IR intrinsic operations that are used to target Arm Scalable Matrix Extension. Through the available conversion and ArmSME passes you can, for example, lower a [linalg.matmul](https://mlir.llvm.org/docs/Dialects/Linalg/#linalgmatmul-linalgmatmulop) -operation to Arm SME +opereation to Arm SME [FMOPA](https://developer.arm.com/documentation/ddi0602/2023-03/SME-Instructions/FMOPA--widening---Half-precision-floating-point-sum-of-outer-products-and-accumulate-) (floating-point outer product) operations. 
See one of the in-tree end-to-end integration tests for reference: @@ -14,14 +14,6 @@ integration tests for reference: * [Linalg/CPU/ArmSME/matmul.mlir](https://github.com/llvm/llvm-project/blob/main/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir) * [Vector/CPU/ArmSME/test-outerproduct-f64.mlir](https://github.com/llvm/llvm-project/blob/main/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir) -In order to run ArmSME integration tests, include these flags in the CMake -invocation when configuring LLVM and MLIR: -```bash - -DMLIR_INCLUDE_INTEGRATION_TESTS=On - -DMLIR_RUN_ARM_SME_TESTS=On - -DARM_EMULATOR_EXECUTABLE= -``` - These tests are run "post-commit" by the [clang-aarch64-sve-vla](https://lab.llvm.org/buildbot/#/builders/197) LLVM BuildBot worker. diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 5319a9cb33e0..baf07ea1f010 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -1,5 +1,3 @@ -include(MLIRCheckHardwareFeatures) - add_subdirectory(CAPI) add_subdirectory(lib) @@ -41,10 +39,6 @@ if (MLIR_INCLUDE_INTEGRATION_TESTS) option(MLIR_RUN_ARM_SVE_TESTS "Run Arm SVE tests.") option(MLIR_RUN_ARM_SME_TESTS "Run Arm SME tests.") - # Check whether an emulator is required - if yes then make sure that it's - # been set. - check_emulator(MLIR_RUN_ARM_SVE_TESTS "HWCAP_SVE" ARM_EMULATOR_EXECUTABLE) - check_emulator(MLIR_RUN_ARM_SME_TESTS "HWCAP2_SME" ARM_EMULATOR_EXECUTABLE) # The native target may not be enabled when cross compiling, raise an error. if(NOT MLIR_ENABLE_EXECUTION_ENGINE) -- cgit v1.2.3 From 62740d87bcb4a43569bc88fa76c24d27a064dcf9 Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Thu, 4 Apr 2024 12:15:32 -0400 Subject: [libc][bazel][math] Add float128 math functions and their smoke tests to bazel layout. 
(#87645) --- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 79 +++++------ .../libc/test/src/math/smoke/BUILD.bazel | 147 +++++++++++++++++++++ 2 files changed, 183 insertions(+), 43 deletions(-) create mode 100644 utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 9dfe4c48184e..d8375de6b3c1 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1696,16 +1696,14 @@ libc_math_function( ) libc_math_function(name = "fabs") - libc_math_function(name = "fabsf") - libc_math_function(name = "fabsl") +libc_math_function(name = "fabsf128") libc_math_function(name = "fdim") - libc_math_function(name = "fdimf") - libc_math_function(name = "fdiml") +libc_math_function(name = "fdimf128") libc_math_function( name = "ceil", @@ -1730,6 +1728,9 @@ libc_math_function( ], ) +libc_math_function(name = "ceilf128") + + libc_math_function( name = "floor", specializations = [ @@ -1747,12 +1748,12 @@ libc_math_function( ) libc_math_function(name = "floorl") +libc_math_function(name = "floorf128") libc_math_function(name = "ldexp") - libc_math_function(name = "ldexpf") - libc_math_function(name = "ldexpl") +libc_math_function(name = "ldexpf128") libc_math_function( name = "trunc", @@ -1771,6 +1772,7 @@ libc_math_function( ) libc_math_function(name = "truncl") +libc_math_function(name = "truncf128") libc_math_function( name = "round", @@ -1789,6 +1791,7 @@ libc_math_function( ) libc_math_function(name = "roundl") +libc_math_function(name = "roundf128") libc_math_function( name = "fmod", @@ -1805,10 +1808,9 @@ libc_math_function( ) libc_math_function(name = "frexp") - libc_math_function(name = "frexpf") - libc_math_function(name = "frexpl") +libc_math_function(name = "frexpf128") libc_math_function(name = "hypot") @@ -1820,40 +1822,32 @@ libc_math_function( ) libc_math_function(name 
= "logb") - libc_math_function(name = "logbf") - libc_math_function(name = "logbl") +libc_math_function(name = "logbf128") libc_math_function(name = "modf") - libc_math_function(name = "modff") - libc_math_function(name = "modfl") +libc_math_function(name = "modff128") libc_math_function(name = "remquo") - libc_math_function(name = "remquof") - libc_math_function(name = "remquol") libc_math_function(name = "remainder") - libc_math_function(name = "remainderf") - libc_math_function(name = "remainderl") libc_math_function(name = "fmin") - libc_math_function(name = "fminf") - libc_math_function(name = "fminl") +libc_math_function(name = "fminf128") libc_math_function(name = "fmax") - libc_math_function(name = "fmaxf") - libc_math_function(name = "fmaxl") +libc_math_function(name = "fmaxf128") libc_math_function( name = "cosf", @@ -1927,49 +1921,47 @@ libc_math_function( ], ) -libc_math_function(name = "copysign") +libc_math_function( + name = "sqrtf128", + additional_deps = [ + ":__support_fputil_sqrt", + ], +) +libc_math_function(name = "copysign") libc_math_function(name = "copysignf") - libc_math_function(name = "copysignl") - libc_math_function(name = "copysignf128") libc_math_function(name = "ilogb") - libc_math_function(name = "ilogbf") - libc_math_function(name = "ilogbl") +libc_math_function(name = "ilogbf128") libc_math_function(name = "rint") - libc_math_function(name = "rintf") - libc_math_function(name = "rintl") +libc_math_function(name = "rintf128") libc_math_function(name = "lrint") - libc_math_function(name = "lrintf") - libc_math_function(name = "lrintl") +libc_math_function(name = "lrintf128") libc_math_function(name = "llrint") - libc_math_function(name = "llrintf") - libc_math_function(name = "llrintl") +libc_math_function(name = "llrintf128") libc_math_function(name = "lround") - libc_math_function(name = "lroundf") - libc_math_function(name = "lroundl") +libc_math_function(name = "lroundf128") libc_math_function(name = "llround") - 
libc_math_function(name = "llroundf") - libc_math_function(name = "llroundl") +libc_math_function(name = "llroundf128") libc_math_function( name = "nan", @@ -1995,28 +1987,29 @@ libc_math_function( ], ) -libc_math_function(name = "nearbyint") +libc_math_function( + name = "nanf128", + additional_deps = [ + ":__support_str_to_float", + ":errno", + ], +) +libc_math_function(name = "nearbyint") libc_math_function(name = "nearbyintf") - libc_math_function(name = "nearbyintl") libc_math_function(name = "nextafter") - libc_math_function(name = "nextafterf") - libc_math_function(name = "nextafterl") +libc_math_function(name = "nextafterf128") libc_math_function(name = "nexttoward") - libc_math_function(name = "nexttowardf") - libc_math_function(name = "nexttowardl") libc_math_function(name = "scalbn") - libc_math_function(name = "scalbnf") - libc_math_function(name = "scalbnl") ############################## inttypes targets ############################## diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel new file mode 100644 index 000000000000..0d69a480cb28 --- /dev/null +++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel @@ -0,0 +1,147 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# Smoke tests for LLVM libc math.h functions. 
+ +load("//libc:libc_build_rules.bzl", "libc_support_library") +load("//libc/test/src/math:libc_math_test_rules.bzl", "math_test") + +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) + +math_test( + name = "fabsf128", + hdrs = ["FAbsTest.h"], +) + +math_test( + name = "ceilf128", + hdrs = ["CeilTest.h"], +) + +math_test( + name = "floorf128", + hdrs = ["FloorTest.h"], +) + +math_test( + name = "truncf128", + hdrs = ["TruncTest.h"], +) + +math_test( + name = "roundf128", + hdrs = ["RoundTest.h"], +) + +math_test( + name = "frexpf128", + hdrs = ["FrexpTest.h"], +) + +math_test( + name = "logbf128", + hdrs = ["LogbTest.h"], +) + +math_test( + name = "modff128", + hdrs = ["ModfTest.h"], +) + +math_test( + name = "fminf128", + hdrs = ["FMinTest.h"], +) + +math_test( + name = "fmaxf128", + hdrs = ["FMaxTest.h"], +) + +math_test( + name = "sqrtf128", + hdrs = ["SqrtTest.h"], + deps = ["//libc:__support_cpp_bit"], +) + +math_test( + name = "copysignf128", + hdrs = ["CopySignTest.h"], +) + +math_test( + name = "ilogbf128", + hdrs = ["ILogbTest.h"], + deps = ["//libc:__support_cpp_limits"], +) + +math_test( + name = "fdimf128", + hdrs = ["FDimTest.h"], +) + +libc_support_library( + name = "ldexp_test_template", + hdrs = ["LdExpTest.h"], + deps = [ + "//libc:__support_cpp_limits", + "//libc:__support_fputil_fp_bits", + "//libc:__support_fputil_normal_float", + "//libc:llvm_libc_macros_math_macros", + "//libc/test/UnitTest:LibcUnitTest", + "//libc/test/UnitTest:fp_test_helpers", + ], +) + +math_test( + name = "ldexpf128", + hdrs = ["LdExpTest.h"], + deps = ["//libc:__support_cpp_limits"], +) + +math_test( + name = "rintf128", + hdrs = ["RIntTest.h"], +) + +math_test( + name = "lrintf128", + hdrs = ["RoundToIntegerTest.h"], +) + +math_test( + name = "llrintf128", + hdrs = ["RoundToIntegerTest.h"], +) +math_test( + name = "lroundf128", + hdrs = ["RoundToIntegerTest.h"], +) + +math_test( + name = "llroundf128", + hdrs = ["RoundToIntegerTest.h"], +) + 
+libc_support_library( + name = "nextafter_test_template", + hdrs = ["NextAfterTest.h"], + deps = [ + "//libc:__support_cpp_array", + "//libc:__support_cpp_bit", + "//libc:__support_cpp_type_traits", + "//libc:__support_fputil_basic_operations", + "//libc:__support_fputil_fp_bits", + "//libc:llvm_libc_macros_math_macros", + "//libc/test/UnitTest:LibcUnitTest", + "//libc/test/UnitTest:fp_test_helpers", + ], +) + +math_test( + name = "nextafterf128", + deps = [":nextafter_test_template"], +) -- cgit v1.2.3 From ed412494988411fc1aae2f1014c4ecad56d8085f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Apr 2024 17:17:47 +0100 Subject: [CostModel][X86] Update AVX1 sext v4i1 -> v4i64 cost based off worst case llvm-mca numbers We were using raw instruction count which overestimated the costs for #67803 --- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 2 +- llvm/test/Analysis/CostModel/X86/cast.ll | 2 +- llvm/test/Analysis/CostModel/X86/extend.ll | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 2092675a6924..cd6102914cc3 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2664,7 +2664,7 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, }; static const TypeConversionCostTblEntry AVXConversionTbl[] = { - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, diff --git a/llvm/test/Analysis/CostModel/X86/cast.ll b/llvm/test/Analysis/CostModel/X86/cast.ll index d80cb09fd390..47487d6adf68 100644 --- a/llvm/test/Analysis/CostModel/X86/cast.ll +++ b/llvm/test/Analysis/CostModel/X86/cast.ll @@ -374,7 +374,7 @@ define i32 @masks4(<4 x i1> %in) { ; ; AVX1-LABEL: 'masks4' ; 
AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <4 x i1> %in to <4 x i64> -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %S = sext <4 x i1> %in to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %S = sext <4 x i1> %in to <4 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'masks4' diff --git a/llvm/test/Analysis/CostModel/X86/extend.ll b/llvm/test/Analysis/CostModel/X86/extend.ll index 34fa3c426392..4a2585a9ddf9 100644 --- a/llvm/test/Analysis/CostModel/X86/extend.ll +++ b/llvm/test/Analysis/CostModel/X86/extend.ll @@ -1962,7 +1962,7 @@ define i32 @sext_vXi1() "min-legal-vector-width"="256" { ; AVX1-LABEL: 'sext_vXi1' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i1 undef to i64 ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64> -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i64 = sext <16 x i1> undef to <16 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32i64 = sext <32 x i1> undef to <32 x i64> @@ -2242,7 +2242,7 @@ define i32 @sext_vXi1() "min-legal-vector-width"="256" { ; BTVER2-LABEL: 'sext_vXi1' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i1 undef to i64 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64> -; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i1> undef to <4 x 
i64> +; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i64 = sext <16 x i1> undef to <16 x i64> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32i64 = sext <32 x i1> undef to <32 x i64> -- cgit v1.2.3 From 0b293e8c36d97bbd7f85ed5b67ce510ff7fd86ee Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 4 Apr 2024 17:24:16 +0100 Subject: [APInt] Remove multiplicativeInverse with explicit modulus (#87644) All callers have been changed to use the new simpler overload with an implicit modulus of 2^BitWidth. The old form was never used or tested with non-power-of-two modulus anyway. --- llvm/include/llvm/ADT/APInt.h | 3 --- llvm/lib/Support/APInt.cpp | 49 ---------------------------------------- llvm/unittests/ADT/APIntTest.cpp | 19 ++++------------ 3 files changed, 4 insertions(+), 67 deletions(-) diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index bd1716219ee5..8d3c029b2e7e 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -1740,9 +1740,6 @@ public: return *this; } - /// \returns the multiplicative inverse for a given modulo. - APInt multiplicativeInverse(const APInt &modulo) const; - /// \returns the multiplicative inverse of an odd APInt modulo 2^BitWidth. APInt multiplicativeInverse() const; diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp index f8f699f8f6cc..224ea0924f0a 100644 --- a/llvm/lib/Support/APInt.cpp +++ b/llvm/lib/Support/APInt.cpp @@ -1240,55 +1240,6 @@ APInt APInt::sqrt() const { return x_old + 1; } -/// Computes the multiplicative inverse of this APInt for a given modulo. 
The -/// iterative extended Euclidean algorithm is used to solve for this value, -/// however we simplify it to speed up calculating only the inverse, and take -/// advantage of div+rem calculations. We also use some tricks to avoid copying -/// (potentially large) APInts around. -/// WARNING: a value of '0' may be returned, -/// signifying that no multiplicative inverse exists! -APInt APInt::multiplicativeInverse(const APInt& modulo) const { - assert(ult(modulo) && "This APInt must be smaller than the modulo"); - - // Using the properties listed at the following web page (accessed 06/21/08): - // http://www.numbertheory.org/php/euclid.html - // (especially the properties numbered 3, 4 and 9) it can be proved that - // BitWidth bits suffice for all the computations in the algorithm implemented - // below. More precisely, this number of bits suffice if the multiplicative - // inverse exists, but may not suffice for the general extended Euclidean - // algorithm. - - APInt r[2] = { modulo, *this }; - APInt t[2] = { APInt(BitWidth, 0), APInt(BitWidth, 1) }; - APInt q(BitWidth, 0); - - unsigned i; - for (i = 0; r[i^1] != 0; i ^= 1) { - // An overview of the math without the confusing bit-flipping: - // q = r[i-2] / r[i-1] - // r[i] = r[i-2] % r[i-1] - // t[i] = t[i-2] - t[i-1] * q - udivrem(r[i], r[i^1], q, r[i]); - t[i] -= t[i^1] * q; - } - - // If this APInt and the modulo are not coprime, there is no multiplicative - // inverse, so return 0. We check this by looking at the next-to-last - // remainder, which is the gcd(*this,modulo) as calculated by the Euclidean - // algorithm. - if (r[i] != 1) - return APInt(BitWidth, 0); - - // The next-to-last t is the multiplicative inverse. However, we are - // interested in a positive inverse. Calculate a positive one from a negative - // one if necessary. A simple addition of the modulo suffices because - // abs(t[i]) is known to be less than *this/2 (see the link above). 
- if (t[i].isNegative()) - t[i] += modulo; - - return std::move(t[i]); -} - /// \returns the multiplicative inverse of an odd APInt modulo 2^BitWidth. APInt APInt::multiplicativeInverse() const { assert((*this)[0] && diff --git a/llvm/unittests/ADT/APIntTest.cpp b/llvm/unittests/ADT/APIntTest.cpp index 23f9ee2d39c4..76fc26412407 100644 --- a/llvm/unittests/ADT/APIntTest.cpp +++ b/llvm/unittests/ADT/APIntTest.cpp @@ -3249,22 +3249,11 @@ TEST(APIntTest, SolveQuadraticEquationWrap) { } TEST(APIntTest, MultiplicativeInverseExaustive) { - for (unsigned BitWidth = 1; BitWidth <= 16; ++BitWidth) { - for (unsigned Value = 0; Value < (1u << BitWidth); ++Value) { + for (unsigned BitWidth = 1; BitWidth <= 8; ++BitWidth) { + for (unsigned Value = 1; Value < (1u << BitWidth); Value += 2) { + // Multiplicative inverse exists for all odd numbers. APInt V = APInt(BitWidth, Value); - APInt MulInv = - V.zext(BitWidth + 1) - .multiplicativeInverse(APInt::getSignedMinValue(BitWidth + 1)) - .trunc(BitWidth); - APInt One = V * MulInv; - if (V[0]) { - // Multiplicative inverse exists for all odd numbers. - EXPECT_TRUE(One.isOne()); - EXPECT_TRUE((V * V.multiplicativeInverse()).isOne()); - } else { - // Multiplicative inverse does not exist for even numbers (and 0). - EXPECT_TRUE(MulInv.isZero()); - } + EXPECT_EQ(V * V.multiplicativeInverse(), 1); } } } -- cgit v1.2.3 From 9e3b64b9f95aadf57568576712902a272fe66503 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 4 Apr 2024 09:33:18 -0700 Subject: [llvm-objcopy] Add --compress-sections --compress-sections is similar to --compress-debug-sections but applies to arbitrary sections. * `--compress-sections
=none`: decompress sections * `--compress-sections
=[zlib|zstd]`: compress sections with zlib/zstd Like `--remove-section`, the pattern is by default a glob, but a regex when --regex is specified. For `--remove-section` like options, `!` prevents matches and is not dependent on ordering (see `ELF/wildcard-syntax.test`). Since `--compress-sections a=zlib --compress-sections a=none` naturally allows overriding, having an order-independent `!` would be confusing. Therefore, `!` is disallowed. Sections within a segment are effectively immutable. Report an error for an attempt to (de)compress them. `SHF_ALLOC` sections in a relocatable file can be compressed, but linkers usually reject them. Link: https://discourse.llvm.org/t/rfc-compress-arbitrary-sections-with-ld-lld-compress-sections/71674 Pull Request: https://github.com/llvm/llvm-project/pull/85036 --- llvm/docs/CommandGuide/llvm-objcopy.rst | 8 ++ llvm/docs/ReleaseNotes.rst | 4 + llvm/include/llvm/ObjCopy/CommonConfig.h | 3 + llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp | 34 ++++-- .../ELF/compress-sections-within-segment.s | 38 ++++++ .../tools/llvm-objcopy/ELF/compress-sections.s | 128 +++++++++++++++++++++ .../llvm-objcopy/ELF/decompress-sections.test | 29 +++++ llvm/tools/llvm-objcopy/ObjcopyOptions.cpp | 36 ++++++ llvm/tools/llvm-objcopy/ObjcopyOpts.td | 6 + 9 files changed, 278 insertions(+), 8 deletions(-) create mode 100644 llvm/test/tools/llvm-objcopy/ELF/compress-sections-within-segment.s create mode 100644 llvm/test/tools/llvm-objcopy/ELF/compress-sections.s diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst index 985d16eb11cf..57d6280d57c8 100644 --- a/llvm/docs/CommandGuide/llvm-objcopy.rst +++ b/llvm/docs/CommandGuide/llvm-objcopy.rst @@ -309,6 +309,14 @@ them. Compress DWARF debug sections in the output, using the specified format. Supported formats are ``zlib`` and ``zstd``. Use ``zlib`` if ```` is omitted. +.. option:: --compress-sections
= + + Compress or decompress sections matched by ``
`` using the specified + format. Supported formats are ``zlib`` and ``zstd``. Specify ``none`` for + decompression. When a section is matched by multiple options, the last one + wins. A wildcard ``
`` starting with '!' is disallowed. + Sections within a segment cannot be (de)compressed. + .. option:: --decompress-debug-sections Decompress any compressed DWARF debug sections in the output. diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 7588048334d7..ff7fed9ad590 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -182,6 +182,10 @@ Changes to the LLVM tools for ELF input to skip the specified symbols when executing other options that can change a symbol's name, binding or visibility. +* llvm-objcopy now supports ``--compress-sections`` to compress or decompress + arbitrary sections not within a segment. + (`#85036 `_.) + * llvm-profgen now supports COFF+DWARF binaries. This enables Sample-based PGO on Windows using Intel VTune's SEP. For details on usage, see the `end-user documentation for SPGO diff --git a/llvm/include/llvm/ObjCopy/CommonConfig.h b/llvm/include/llvm/ObjCopy/CommonConfig.h index 9d6d5fb23b18..ae08d4032736 100644 --- a/llvm/include/llvm/ObjCopy/CommonConfig.h +++ b/llvm/include/llvm/ObjCopy/CommonConfig.h @@ -262,6 +262,9 @@ struct CommonConfig { bool DecompressDebugSections = false; DebugCompressionType CompressionType = DebugCompressionType::None; + + SmallVector, 0> + compressSections; }; } // namespace objcopy diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp index 205bc1ef5b1a..f343d1447e05 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp +++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp @@ -215,23 +215,41 @@ static Error dumpSectionToFile(StringRef SecName, StringRef Filename, } Error Object::compressOrDecompressSections(const CommonConfig &Config) { - // Build a list of the debug sections we are going to replace. - // We can't call `AddSection` while iterating over sections, + // Build a list of sections we are going to replace. + // We can't call `addSection` while iterating over sections, // because it would mutate the sections array. 
SmallVector>, 0> ToReplace; for (SectionBase &Sec : sections()) { - if ((Sec.Flags & SHF_ALLOC) || !StringRef(Sec.Name).starts_with(".debug")) + std::optional CType; + for (auto &[Matcher, T] : Config.compressSections) + if (Matcher.matches(Sec.Name)) + CType = T; + // Handle --compress-debug-sections and --decompress-debug-sections, which + // apply to non-ALLOC debug sections. + if (!(Sec.Flags & SHF_ALLOC) && StringRef(Sec.Name).starts_with(".debug")) { + if (Config.CompressionType != DebugCompressionType::None) + CType = Config.CompressionType; + else if (Config.DecompressDebugSections) + CType = DebugCompressionType::None; + } + if (!CType) continue; + + if (Sec.ParentSegment) + return createStringError( + errc::invalid_argument, + "section '" + Sec.Name + + "' within a segment cannot be (de)compressed"); + if (auto *CS = dyn_cast(&Sec)) { - if (Config.DecompressDebugSections) { + if (*CType == DebugCompressionType::None) ToReplace.emplace_back( &Sec, [=] { return &addSection(*CS); }); - } - } else if (Config.CompressionType != DebugCompressionType::None) { - ToReplace.emplace_back(&Sec, [&, S = &Sec] { + } else if (*CType != DebugCompressionType::None) { + ToReplace.emplace_back(&Sec, [=, S = &Sec] { return &addSection( - CompressedSection(*S, Config.CompressionType, Is64Bits)); + CompressedSection(*S, *CType, Is64Bits)); }); } } diff --git a/llvm/test/tools/llvm-objcopy/ELF/compress-sections-within-segment.s b/llvm/test/tools/llvm-objcopy/ELF/compress-sections-within-segment.s new file mode 100644 index 000000000000..064ffcadc12e --- /dev/null +++ b/llvm/test/tools/llvm-objcopy/ELF/compress-sections-within-segment.s @@ -0,0 +1,38 @@ +## Disallow (de)compression for sections within a segment as they are +## effectively immutable. 
+# RUN: rm -rf %t && mkdir %t && cd %t +# RUN: yaml2obj %s -o a +# RUN: not llvm-objcopy a /dev/null --compress-sections .text=zlib 2>&1 | FileCheck %s --implicit-check-not=error: + +# CHECK: error: 'a': section '.text' within a segment cannot be (de)compressed + +# RUN: not llvm-objcopy a /dev/null --compress-sections foo=none 2>&1 | FileCheck %s --check-prefix=CHECK2 --implicit-check-not=error: + +# CHECK2: error: 'a': section 'foo' within a segment cannot be (de)compressed + +## There is an error even if 'foo' is already compressed with zlib. +# RUN: not llvm-objcopy a /dev/null --compress-sections foo=zlib 2>&1 | FileCheck %s --check-prefix=CHECK3 --implicit-check-not=error: + +# CHECK3: error: 'a': section 'foo' within a segment cannot be (de)compressed + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_LOAD + FirstSec: .text + LastSec: foo + Align: 0x1000 + Offset: 0x1000 +Sections: + - Name: .text + Type: SHT_PROGBITS + Offset: 0x1000 + Content: C3 + - Name: foo + Type: SHT_PROGBITS + Flags: [ SHF_COMPRESSED ] + Content: 010000000000000040000000000000000100000000000000789cd36280002d3269002f800151 diff --git a/llvm/test/tools/llvm-objcopy/ELF/compress-sections.s b/llvm/test/tools/llvm-objcopy/ELF/compress-sections.s new file mode 100644 index 000000000000..e6fa86068a1a --- /dev/null +++ b/llvm/test/tools/llvm-objcopy/ELF/compress-sections.s @@ -0,0 +1,128 @@ +# REQUIRES: x86-registered-target, zlib, zstd + +# RUN: rm -rf %t && mkdir %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o a.o +## '*0=none' wins because it is the last. '*0' sections are decompressed (if originally compressed) or kept unchanged (if uncompressed). +## No section is named 'nomatch'. The third option is a no-op. 
+# RUN: llvm-objcopy a.o out --compress-sections='*0=zlib' --compress-sections '*0=none' --compress-sections 'nomatch=none' 2>&1 | count 0 +# RUN: llvm-readelf -S out | FileCheck %s --check-prefix=CHECK1 + +# CHECK1: Name Type Address Off Size ES Flg Lk Inf Al +# CHECK1: .text PROGBITS [[#%x,TEXT:]] [[#%x,]] [[#%x,]] 00 AX 0 0 4 +# CHECK1: foo0 PROGBITS [[#%x,FOO0:]] [[#%x,]] [[#%x,]] 00 A 0 0 8 +# CHECK1-NEXT: .relafoo0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 3 8 +# CHECK1-NEXT: foo1 PROGBITS [[#%x,FOO1:]] [[#%x,]] [[#%x,]] 00 A 0 0 8 +# CHECK1-NEXT: .relafoo1 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 5 8 +# CHECK1: nonalloc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8 +# CHECK1-NEXT: .relanonalloc0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 7 8 +# CHECK1-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8 +# CHECK1-NEXT: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MS 0 0 1 + +## Mixing zlib and zstd. +# RUN: llvm-objcopy a.o out2 --compress-sections '*c0=zlib' --compress-sections .debug_str=zstd +# RUN: llvm-readelf -Sr -x nonalloc0 -x .debug_str out2 2>&1 | FileCheck %s --check-prefix=CHECK2 +# RUN: llvm-readelf -z -x nonalloc0 -x .debug_str out2 | FileCheck %s --check-prefix=CHECK2DE + +# CHECK2: Name Type Address Off Size ES Flg Lk Inf Al +# CHECK2: .text PROGBITS [[#%x,TEXT:]] [[#%x,]] [[#%x,]] 00 AX 0 0 4 +# CHECK2: foo0 PROGBITS [[#%x,FOO0:]] [[#%x,]] [[#%x,]] 00 A 0 0 8 +# CHECK2-NEXT: .relafoo0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 3 8 +# CHECK2-NEXT: foo1 PROGBITS [[#%x,FOO1:]] [[#%x,]] [[#%x,]] 00 A 0 0 8 +# CHECK2-NEXT: .relafoo1 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 5 8 +# CHECK2: nonalloc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 8 +# CHECK2-NEXT: .relanonalloc0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 IC 11 7 8 +# CHECK2-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8 +# CHECK2-NEXT: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MSC 0 0 8 + +## llvm-readelf -r doesn't 
support SHF_COMPRESSED SHT_RELA. +# CHECK2: warning: {{.*}}: unable to read relocations from SHT_RELA section with index 8: section [index 8] has an invalid sh_size ([[#]]) which is not a multiple of its sh_entsize (24) + +# CHECK2: Hex dump of section 'nonalloc0': +## zlib with ch_size=0x10 +# CHECK2-NEXT: 01000000 00000000 10000000 00000000 +# CHECK2-NEXT: 08000000 00000000 {{.*}} +# CHECK2: Hex dump of section '.debug_str': +## zstd with ch_size=0x38 +# CHECK2-NEXT: 02000000 00000000 38000000 00000000 +# CHECK2-NEXT: 01000000 00000000 {{.*}} + +# CHECK2DE: Hex dump of section 'nonalloc0': +# CHECK2DE-NEXT: 0x00000000 00000000 00000000 00000000 00000000 ................ +# CHECK2DE-EMPTY: +# CHECK2DE-NEXT: Hex dump of section '.debug_str': +# CHECK2DE-NEXT: 0x00000000 41414141 41414141 41414141 41414141 AAAAAAAAAAAAAAAA + +## --decompress-debug-sections takes precedence, even if it is before --compress-sections. +# RUN: llvm-objcopy a.o out3 --decompress-debug-sections --compress-sections .debug_str=zstd +# RUN: llvm-readelf -S out3 | FileCheck %s --check-prefix=CHECK3 + +# CHECK3: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MS 0 0 1 + +# RUN: llvm-objcopy a.o out4 --compress-sections '*0=zlib' +# RUN: llvm-readelf -S out4 | FileCheck %s --check-prefix=CHECK4 + +# CHECK4: Name Type Address Off Size ES Flg Lk Inf Al +# CHECK4: .text PROGBITS [[#%x,TEXT:]] [[#%x,]] [[#%x,]] 00 AX 0 0 4 +# CHECK4: foo0 PROGBITS [[#%x,FOO0:]] [[#%x,]] [[#%x,]] 00 AC 0 0 8 +# CHECK4-NEXT: .relafoo0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 IC 11 3 8 +# CHECK4-NEXT: foo1 PROGBITS [[#%x,FOO1:]] [[#%x,]] [[#%x,]] 00 A 0 0 8 +# CHECK4-NEXT: .relafoo1 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 5 8 +# CHECK4: nonalloc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 8 +# CHECK4-NEXT: .relanonalloc0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 IC 11 7 8 +# CHECK4-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8 +# CHECK4-NEXT: .debug_str PROGBITS 0000000000000000 [[#%x,]] 
[[#%x,]] 01 MS 0 0 1 + +## If a section is already compressed, compression request for another format is ignored. +# RUN: llvm-objcopy a.o out5 --compress-sections 'nonalloc0=zlib' +# RUN: llvm-readelf -x nonalloc0 out5 | FileCheck %s --check-prefix=CHECK5 +# RUN: llvm-objcopy out5 out5a --compress-sections 'nonalloc0=zstd' +# RUN: cmp out5 out5a + +# CHECK5: Hex dump of section 'nonalloc0': +## zlib with ch_size=0x10 +# CHECK5-NEXT: 01000000 00000000 10000000 00000000 +# CHECK5-NEXT: 08000000 00000000 {{.*}} + +# RUN: not llvm-objcopy --compress-sections=foo a.o out 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERR1 --implicit-check-not=error: +# ERR1: error: --compress-sections: parse error, not 'section-glob=[none|zlib|zstd]' + +# RUN: llvm-objcopy --compress-sections 'a[=zlib' a.o out 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ERR2 --implicit-check-not=error: +# ERR2: warning: invalid glob pattern, unmatched '[' + +# RUN: not llvm-objcopy a.o out --compress-sections='.debug*=zlib-gabi' --compress-sections='.debug*=' 2>&1 | \ +# RUN: FileCheck -check-prefix=ERR3 %s +# ERR3: error: invalid or unsupported --compress-sections format: .debug*=zlib-gabi + +# RUN: not llvm-objcopy a.o out --compress-sections='!.debug*=zlib' 2>&1 | \ +# RUN: FileCheck -check-prefix=ERR4 %s +# ERR4: error: --compress-sections: negative pattern is unsupported + +.globl _start +_start: + ret + +.section foo0,"a" +.balign 8 +.quad .text-. +.quad .text-. +.section foo1,"a" +.balign 8 +.quad .text-. +.quad .text-. 
+.section nonalloc0,"" +.balign 8 +.quad .text+1 +.quad .text+2 +sym0: +.section nonalloc1,"" +.balign 8 +.quad 42 +sym1: + +.section .debug_str,"MS",@progbits,1 +.Linfo_string0: + .asciz "AAAAAAAAAAAAAAAAAAAAAAAAAAA" +.Linfo_string1: + .asciz "BBBBBBBBBBBBBBBBBBBBBBBBBBB" diff --git a/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test b/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test index 4258ddbe66a3..d9f4f3809c4d 100644 --- a/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test +++ b/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test @@ -4,6 +4,8 @@ # RUN: yaml2obj %s -o %t # RUN: llvm-objcopy --decompress-debug-sections %t %t.de # RUN: llvm-readelf -S %t.de | FileCheck %s +# RUN: llvm-objcopy --compress-sections '*nonalloc=none' --compress-sections .debugx=none %t %t.1.de +# RUN: cmp %t.de %t.1.de # CHECK: Name Type Address Off Size ES Flg Lk Inf Al # CHECK: .debug_alloc PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 AC 0 0 0 @@ -11,6 +13,33 @@ # CHECK-NEXT: .debugx PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 1 # CHECK-NEXT: nodebug PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 0 +# RUN: llvm-objcopy --compress-sections '.debug*=none' %t %t2.de +# RUN: llvm-readelf -S -x .debug_alloc -x .debug_nonalloc -x .debugx %t2.de | FileCheck %s --check-prefix=CHECK2 + +# CHECK2: Name Type Address Off Size ES Flg Lk Inf Al +# CHECK2: .debug_alloc PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 A 0 0 1 +# CHECK2-NEXT: .debug_nonalloc PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 1 +# CHECK2-NEXT: .debugx PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 1 +# CHECK2-NEXT: nodebug PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 0 + +# CHECK2: Hex dump of section '.debug_alloc': +# CHECK2-NEXT: 0x00000000 2a000000 00000000 2a000000 00000000 *.......*....... +# CHECK2-NEXT: 0x00000010 2a000000 00000000 2a000000 00000000 *.......*....... +# CHECK2-NEXT: 0x00000020 2a000000 00000000 2a000000 00000000 *.......*....... 
+# CHECK2-NEXT: 0x00000030 2a000000 00000000 2a000000 00000000 *.......*....... +# CHECK2-EMPTY: +# CHECK2: Hex dump of section '.debug_nonalloc': +# CHECK2-NEXT: 0x00000000 2a000000 00000000 2a000000 00000000 *.......*....... +# CHECK2-NEXT: 0x00000010 2a000000 00000000 2a000000 00000000 *.......*....... +# CHECK2-NEXT: 0x00000020 2a000000 00000000 2a000000 00000000 *.......*....... +# CHECK2-NEXT: 0x00000030 2a000000 00000000 2a000000 00000000 *.......*....... +# CHECK2-EMPTY: +# CHECK2-NEXT: Hex dump of section '.debugx': +# CHECK2-NEXT: 0x00000000 2a000000 00000000 2a000000 00000000 *.......*....... +# CHECK2-NEXT: 0x00000010 2a000000 00000000 2a000000 00000000 *.......*....... +# CHECK2-NEXT: 0x00000020 2a000000 00000000 2a000000 00000000 *.......*....... +# CHECK2-NEXT: 0x00000030 2a000000 00000000 2a000000 00000000 *.......*....... + --- !ELF FileHeader: Class: ELFCLASS64 diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp index 7269c51a08d6..70e85460d3df 100644 --- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp +++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp @@ -736,6 +736,42 @@ objcopy::parseObjcopyOptions(ArrayRef RawArgsArr, return createStringError(errc::invalid_argument, Reason); } + for (const auto *A : InputArgs.filtered(OBJCOPY_compress_sections)) { + SmallVector Fields; + StringRef(A->getValue()).split(Fields, '='); + if (Fields.size() != 2 || Fields[1].empty()) { + return createStringError( + errc::invalid_argument, + A->getSpelling() + + ": parse error, not 'section-glob=[none|zlib|zstd]'"); + } + + auto Type = StringSwitch(Fields[1]) + .Case("zlib", DebugCompressionType::Zlib) + .Case("zstd", DebugCompressionType::Zstd) + .Default(DebugCompressionType::None); + if (Type == DebugCompressionType::None && Fields[1] != "none") { + return createStringError( + errc::invalid_argument, + "invalid or unsupported --compress-sections format: %s", + A->getValue()); + } + + auto &P = 
Config.compressSections.emplace_back(); + P.second = Type; + auto Matcher = + NameOrPattern::create(Fields[0], SectionMatchStyle, ErrorCallback); + // =none allows overriding a previous =zlib or =zstd. Reject negative + // patterns, which would be confusing. + if (Matcher && !Matcher->isPositiveMatch()) { + return createStringError( + errc::invalid_argument, + "--compress-sections: negative pattern is unsupported"); + } + if (Error E = P.first.addMatcher(std::move(Matcher))) + return std::move(E); + } + Config.AddGnuDebugLink = InputArgs.getLastArgValue(OBJCOPY_add_gnu_debuglink); // The gnu_debuglink's target is expected to not change or else its CRC would // become invalidated and get rejected. We can avoid recalculating the diff --git a/llvm/tools/llvm-objcopy/ObjcopyOpts.td b/llvm/tools/llvm-objcopy/ObjcopyOpts.td index be02616e8c68..4bc80eba05f8 100644 --- a/llvm/tools/llvm-objcopy/ObjcopyOpts.td +++ b/llvm/tools/llvm-objcopy/ObjcopyOpts.td @@ -35,6 +35,12 @@ def : Flag<["--"], "compress-debug-sections">, Alias, AliasArgs<["zlib"]>; def decompress_debug_sections : Flag<["--"], "decompress-debug-sections">, HelpText<"Decompress DWARF debug sections">; +defm compress_sections + : Eq<"compress-sections", + "Compress or decompress sections using specified format. Supported " + "formats: zlib, zstd. Specify 'none' for decompression">, + MetaVarName<"=">; + defm split_dwo : Eq<"split-dwo", "Equivalent to --extract-dwo and as the output file and no other options, " "and then --strip-dwo on the input file">, -- cgit v1.2.3 From dcc45faa30041a3378bcde4857df205382f1996a Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 4 Apr 2024 09:38:01 -0700 Subject: [ELF] PROVIDE: fix spurious "symbol not found" When archive member extraction involving ENTRY happens after `addScriptReferencedSymbolsToSymTable`, `addScriptReferencedSymbolsToSymTable` may fail to define some PROVIDE symbols used by ENTRY. This is an edge case that regressed after #84512. 
(The interaction with PROVIDE and ENTRY-in-archive was not considered before). While here, also ensure that --undefined-glob extracted object files are parsed before `addScriptReferencedSymbolsToSymTable`. Fixes: ebb326a51fec37b5a47e5702e8ea157cd4f835cd Pull Request: https://github.com/llvm/llvm-project/pull/87530 --- lld/ELF/Driver.cpp | 17 ++++++++++------- lld/test/ELF/linkerscript/symbolreferenced.s | 25 +++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 8dbff7fb86e7..86cc09621a91 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -2767,13 +2767,6 @@ template void LinkerDriver::link(opt::InputArgList &args) { // Create dynamic sections for dynamic linking and static PIE. config->hasDynSymTab = !ctx.sharedFiles.empty() || config->isPic; - script->addScriptReferencedSymbolsToSymTable(); - - // Prevent LTO from removing any definition referenced by -u. - for (StringRef name : config->undefined) - if (Defined *sym = dyn_cast_or_null(symtab.find(name))) - sym->isUsedInRegularObj = true; - // If an entry symbol is in a static archive, pull out that file now. if (Symbol *sym = symtab.find(config->entry)) handleUndefined(sym, "--entry"); @@ -2782,6 +2775,16 @@ template void LinkerDriver::link(opt::InputArgList &args) { for (StringRef pat : args::getStrings(args, OPT_undefined_glob)) handleUndefinedGlob(pat); + // After potential archive member extraction involving ENTRY and + // -u/--undefined-glob, check whether PROVIDE symbols should be defined (the + // RHS may refer to definitions in just extracted object files). + script->addScriptReferencedSymbolsToSymTable(); + + // Prevent LTO from removing any definition referenced by -u. + for (StringRef name : config->undefined) + if (Defined *sym = dyn_cast_or_null(symtab.find(name))) + sym->isUsedInRegularObj = true; + // Mark -init and -fini symbols so that the LTO doesn't eliminate them. 
if (Symbol *sym = dyn_cast_or_null(symtab.find(config->init))) sym->isUsedInRegularObj = true; diff --git a/lld/test/ELF/linkerscript/symbolreferenced.s b/lld/test/ELF/linkerscript/symbolreferenced.s index 6f583d20e276..684808269083 100644 --- a/lld/test/ELF/linkerscript/symbolreferenced.s +++ b/lld/test/ELF/linkerscript/symbolreferenced.s @@ -50,6 +50,21 @@ # RUN: not ld.lld -T chain2.t a.o 2>&1 | FileCheck %s --check-prefix=ERR --implicit-check-not=error: # ERR-COUNT-3: error: chain2.t:1: symbol not found: undef +## _start in a lazy object file references PROVIDE symbols. We extract _start +## earlier to avoid spurious "symbol not found" errors. +# RUN: llvm-mc -filetype=obj -triple=x86_64 undef.s -o undef.o +# RUN: llvm-mc -filetype=obj -triple=x86_64 start.s -o start.o +# RUN: ld.lld -T chain2.t undef.o --start-lib start.o --end-lib -o lazy +# RUN: llvm-nm lazy | FileCheck %s --check-prefix=LAZY +# RUN: ld.lld -e 0 -T chain2.t --undefined-glob '_start*' undef.o --start-lib start.o --end-lib -o lazy +# RUN: llvm-nm lazy | FileCheck %s --check-prefix=LAZY + +# LAZY: T _start +# LAZY-NEXT: t f1 +# LAZY-NEXT: T f2 +# LAZY-NEXT: T newsym +# LAZY-NEXT: T unde + #--- a.s .global _start _start: @@ -89,3 +104,13 @@ PROVIDE(newsym = f1); PROVIDE(f2 = undef); PROVIDE_HIDDEN(f1 = f2); PROVIDE(newsym = f1); + +#--- undef.s +.globl undef +undef: ret + +#--- start.s +.globl _start +_start: ret +.data +.quad newsym -- cgit v1.2.3 From be8fd86f6a57da79a4dbc8d1f4dca2e7adb1192a Mon Sep 17 00:00:00 2001 From: Gulfem Savrun Yeniceri Date: Thu, 4 Apr 2024 16:36:43 +0000 Subject: Revert "[GlobalISel] Fix the infinite loop issue in `commute_int_constant_to_rhs`" This reverts commit 1f01c580444ea2daef67f95ffc5fde2de5a37cec because combine-commute-int-const-lhs.mir test failed in multiple builders. 
https://lab.llvm.org/buildbot/#/builders/124/builds/10375 https://luci-milo.appspot.com/ui/p/fuchsia/builders/prod/clang-linux-x64/b8751607530180046481/overview --- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 17 +++++++------ .../GlobalISel/combine-commute-int-const-lhs.mir | 28 ---------------------- 2 files changed, 8 insertions(+), 37 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 719209e0edd5..e53e35d98cec 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6273,15 +6273,14 @@ bool CombinerHelper::matchShiftsTooBig(MachineInstr &MI) { bool CombinerHelper::matchCommuteConstantToRHS(MachineInstr &MI) { Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); - if (!getIConstantVRegVal(LHS, MRI)) { - // Skip commuting if LHS is not a constant. But, LHS may be a - // G_CONSTANT_FOLD_BARRIER. If so we commute as long as we don't already - // have a constant on the RHS. - if (MRI.getVRegDef(LHS)->getOpcode() != - TargetOpcode::G_CONSTANT_FOLD_BARRIER) - return false; - } - // Commute as long as RHS is not a constant or G_CONSTANT_FOLD_BARRIER. + auto *LHSDef = MRI.getVRegDef(LHS); + if (getIConstantVRegVal(LHS, MRI).has_value()) + return true; + + // LHS may be a G_CONSTANT_FOLD_BARRIER. If so we commute + // as long as we don't already have a constant on the RHS. 
+ if (LHSDef->getOpcode() != TargetOpcode::G_CONSTANT_FOLD_BARRIER) + return false; return MRI.getVRegDef(RHS)->getOpcode() != TargetOpcode::G_CONSTANT_FOLD_BARRIER && !getIConstantVRegVal(RHS, MRI); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir deleted file mode 100644 index b145a6d3fd39..000000000000 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir +++ /dev/null @@ -1,28 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner %s -o - \ -# RUN: --aarch64prelegalizercombiner-disable-rule=constant_fold_binop | FileCheck %s - -# `constant_fold_binop` is disabled to trigger the infinite loop in `commute_int_constant_to_rhs`. - ---- -name: add -tracksRegLiveness: true -body: | - bb.0: - liveins: $s0 - - ; CHECK-LABEL: name: add - ; CHECK: liveins: $s0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %c0:_(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: %c1:_(s32) = G_CONSTANT i32 2 - ; CHECK-NEXT: %add:_(s32) = G_ADD %c0, %c1 - ; CHECK-NEXT: $s0 = COPY %add(s32) - ; CHECK-NEXT: RET_ReallyLR - %c0:_(s32) = G_CONSTANT i32 1 - %c1:_(s32) = G_CONSTANT i32 2 - %add:_(s32) = G_ADD %c0, %c1 - $s0 = COPY %add - RET_ReallyLR - -... -- cgit v1.2.3 From dcab42a0f9aab1d06aadc821e64af381da744819 Mon Sep 17 00:00:00 2001 From: Julian Nagele Date: Thu, 4 Apr 2024 17:44:50 +0100 Subject: [TBAA] Test for tbaa.struct metadata with bitfields in big endian layout (#87617) This test exposes what I think is invalid tbaa.struct metadata currently generated for bitfields when using big endian layout. The regions given by `!{i64 2, i64 4, [[META3:![0-9]+]], i64 4, i64 4 ...` are overlapping. This issue was originally observed in https://github.com/llvm/llvm-project/pull/86709. 
--- .../CodeGen/tbaa-struct-bitfield-endianness.cpp | 39 ++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 clang/test/CodeGen/tbaa-struct-bitfield-endianness.cpp diff --git a/clang/test/CodeGen/tbaa-struct-bitfield-endianness.cpp b/clang/test/CodeGen/tbaa-struct-bitfield-endianness.cpp new file mode 100644 index 000000000000..80884b49ddc6 --- /dev/null +++ b/clang/test/CodeGen/tbaa-struct-bitfield-endianness.cpp @@ -0,0 +1,39 @@ +// RUN: %clang_cc1 -triple aarch64_be-apple-darwin -emit-llvm -o - -O1 %s | \ +// RUN: FileCheck -check-prefixes=CHECK,CHECK-BE %s +// RUN: %clang_cc1 -triple aarch64-apple-darwin -emit-llvm -o - -O1 %s | \ +// RUN: FileCheck -check-prefixes=CHECK,CHECK-LE %s +// +// Check that TBAA metadata for structs containing bitfields is +// consistent between big and little endian layouts. +// +// FIXME: The metadata below is invalid for the big endian layout: the +// start offset of 2 is incorrect. + +struct NamedBitfields { + int f1 : 8; + int f2 : 8; + unsigned f3 : 1; + unsigned f4 : 15; + int f5; + double f6; +}; + +// CHECK-LABEL: _Z4copyP14NamedBitfieldsS0_ +// CHECK-SAME: ptr nocapture noundef writeonly [[A1:%.*]], ptr nocapture noundef readonly [[A2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[A1]], ptr noundef nonnull align 8 dereferenceable(16) [[A2]], i64 16, i1 false), !tbaa.struct [[TBAA_STRUCT2:![0-9]+]] +// CHECK-NEXT: ret void +// +void copy(NamedBitfields *a1, NamedBitfields *a2) { + *a1 = *a2; +} + +// CHECK-BE: [[TBAA_STRUCT2]] = !{i64 2, i64 4, [[META3:![0-9]+]], i64 4, i64 4, [[META6:![0-9]+]], i64 8, i64 8, [[META8:![0-9]+]]} +// CHECK-LE: [[TBAA_STRUCT2]] = !{i64 0, i64 4, [[META3:![0-9]+]], i64 4, i64 4, [[META6:![0-9]+]], i64 8, i64 8, [[META8:![0-9]+]]} +// CHECK: [[META3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// CHECK: [[META4]] = !{!"omnipotent char", 
[[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C++ TBAA"} +// CHECK: [[META6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK: [[META7]] = !{!"int", [[META4]], i64 0} +// CHECK: [[META8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +// CHECK: [[META9]] = !{!"double", [[META4]], i64 0} -- cgit v1.2.3 From fb2a380b5d4e483602f8cf8f36ca1df322a14a77 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 4 Apr 2024 16:46:53 +0000 Subject: [gn build] Manually port 1679b27 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 8a2ab18bf953..6c09b3611d87 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -778,12 +778,12 @@ if (current_toolchain == default_toolchain) { "__tree", "__tuple/find_index.h", "__tuple/make_tuple_types.h", - "__tuple/pair_like.h", "__tuple/sfinae_helpers.h", "__tuple/tuple_element.h", "__tuple/tuple_indices.h", "__tuple/tuple_like.h", "__tuple/tuple_like_ext.h", + "__tuple/tuple_like_no_subrange.h", "__tuple/tuple_size.h", "__tuple/tuple_types.h", "__type_traits/add_const.h", -- cgit v1.2.3 From 338ecfbac351e4b211836ad73d75bbdf729e9134 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Thu, 4 Apr 2024 17:54:54 +0100 Subject: [libclc] Use VERSION_GREATER_EQUAL where appropriate. NFC This was added in CMake 3.7, which might explain why it wasn't used before. Also reformat a couple of comments. 
--- libclc/CMakeLists.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 2d000cf5cad8..21e5cac68822 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -20,7 +20,7 @@ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS spirv64/lib/SOURCES ) -set( LIBCLC_MIN_LLVM "3.9.0" ) +set( LIBCLC_MIN_LLVM 3.9.0 ) set( LIBCLC_TARGETS_TO_BUILD "all" CACHE STRING "Semicolon-separated list of targets to build, or 'all'." ) @@ -32,7 +32,7 @@ include(AddLLVM) message( STATUS "libclc LLVM version: ${LLVM_PACKAGE_VERSION}" ) -if( ${LLVM_PACKAGE_VERSION} VERSION_LESS ${LIBCLC_MIN_LLVM} ) +if( LLVM_PACKAGE_VERSION VERSION_LESS LIBCLC_MIN_LLVM ) message( FATAL_ERROR "libclc needs at least LLVM ${LIBCLC_MIN_LLVM}" ) endif() @@ -66,7 +66,7 @@ set( LIBCLC_TARGETS_ALL ) # mesa3d environment is only available since LLVM 4.0 -if( ${LLVM_PACKAGE_VERSION} VERSION_GREATER "3.9.0" ) +if( LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL 4.0.0 ) list( APPEND LIBCLC_TARGETS_ALL amdgcn-mesa-mesa3d ) endif() @@ -102,7 +102,7 @@ set( LLVM_VERSION_DEFINE "-DHAVE_LLVM=0x${LLVM_VERSION_MAJOR}0${LLVM_VERSION_MIN # LLVM 13 enables standard includes by default -if( ${LLVM_PACKAGE_VERSION} VERSION_GREATER "12.99.99" ) +if( LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL 13.0.0 ) set( CMAKE_LLAsm_FLAGS "${CMAKE_LLAsm_FLAGS} -cl-no-stdinc" ) set( CMAKE_CLC_FLAGS "${CMAKE_CLC_FLAGS} -cl-no-stdinc" ) endif() @@ -205,7 +205,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) list( APPEND dirs amdgpu ) endif() - #nvptx is special + # nvptx is special if( ${ARCH} STREQUAL nvptx OR ${ARCH} STREQUAL nvptx64 ) set( DARCH ptx ) else() @@ -226,8 +226,8 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) endforeach() endforeach() - # Add the generated convert.cl here to prevent adding - # the one listed in SOURCES + # Add the generated convert.cl here to prevent adding the one listed in + # SOURCES if( NOT ${ARCH} STREQUAL "spirv" AND NOT 
${ARCH} STREQUAL "spirv64" ) if( NOT ENABLE_RUNTIME_SUBNORMAL AND NOT ${ARCH} STREQUAL "clspv" AND NOT ${ARCH} STREQUAL "clspv64" ) -- cgit v1.2.3 From 4167fec40768fb05b411bcefa186fe2b106ca7e4 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Thu, 4 Apr 2024 19:03:01 +0200 Subject: [libc++][chrono] Completes the tzdb class. (#82157) It adds the missing member functions of the tzdb class and adds the free functions that use these member functions. Implements parts of: - P0355 Extending to Calendars and Time Zones --- libcxx/include/__chrono/tzdb.h | 35 +++++++++ libcxx/include/__chrono/tzdb_list.h | 10 +++ libcxx/include/chrono | 5 ++ libcxx/modules/std/chrono.inc | 4 +- libcxx/src/tzdb.cpp | 59 +++++++++++++++ .../chrono.nodiscard_extensions.compile.pass.cpp | 8 +++ .../chrono.nodiscard_extensions.verify.cpp | 8 +++ .../time.zone.db.tzdb/locate_zone.pass.cpp | 84 ++++++++++++++++++++++ .../time.zone.db.access/current_zone.pass.cpp | 77 ++++++++++++++++++++ .../time.zone.db.access/locate_zone.pass.cpp | 62 ++++++++++++++++ .../time.zone.db.tzdb/current_zone.pass.cpp | 79 ++++++++++++++++++++ .../time.zone.db.tzdb/locate_zone.pass.cpp | 64 +++++++++++++++++ 12 files changed, 493 insertions(+), 2 deletions(-) create mode 100644 libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp create mode 100644 libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp create mode 100644 libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp create mode 100644 libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp create mode 100644 libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp diff --git a/libcxx/include/__chrono/tzdb.h b/libcxx/include/__chrono/tzdb.h index 45c20f279f9c..e0bfedf0d782 100644 --- a/libcxx/include/__chrono/tzdb.h +++ b/libcxx/include/__chrono/tzdb.h @@ -16,6 +16,7 @@ // Enable the contents of the header only 
when libc++ was built with experimental features enabled. #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +# include <__algorithm/ranges_lower_bound.h> # include <__chrono/leap_second.h> # include <__chrono/time_zone.h> # include <__chrono/time_zone_link.h> @@ -43,6 +44,40 @@ struct tzdb { vector links; vector leap_seconds; + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI const time_zone* __locate_zone(string_view __name) const { + if (const time_zone* __result = __find_in_zone(__name)) + return __result; + + if (auto __it = ranges::lower_bound(links, __name, {}, &time_zone_link::name); + __it != links.end() && __it->name() == __name) + if (const time_zone* __result = __find_in_zone(__it->target())) + return __result; + + return nullptr; + } + + _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI const time_zone* locate_zone(string_view __name) const { + if (const time_zone* __result = __locate_zone(__name)) + return __result; + + std::__throw_runtime_error("tzdb: requested time zone not found"); + } + + _LIBCPP_NODISCARD_EXT _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI const time_zone* current_zone() const { + return __current_zone(); + } + +private: + _LIBCPP_HIDE_FROM_ABI const time_zone* __find_in_zone(string_view __name) const noexcept { + if (auto __it = ranges::lower_bound(zones, __name, {}, &time_zone::name); + __it != zones.end() && __it->name() == __name) + return std::addressof(*__it); + + return nullptr; + } + + [[nodiscard]] _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI const time_zone* __current_zone() const; }; } // namespace chrono diff --git a/libcxx/include/__chrono/tzdb_list.h b/libcxx/include/__chrono/tzdb_list.h index e8aaf31e3631..693899d37211 100644 --- a/libcxx/include/__chrono/tzdb_list.h +++ b/libcxx/include/__chrono/tzdb_list.h @@ -17,6 +17,7 @@ #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) # include <__availability> +# include <__chrono/time_zone.h> # include <__chrono/tzdb.h> # include <__config> # include <__fwd/string.h> @@ -84,6 +85,15 @@ 
_LIBCPP_NODISCARD_EXT _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI inline con return get_tzdb_list().front(); } +_LIBCPP_NODISCARD_EXT _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI inline const time_zone* +locate_zone(string_view __name) { + return get_tzdb().locate_zone(__name); +} + +_LIBCPP_NODISCARD_EXT _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI inline const time_zone* current_zone() { + return get_tzdb().current_zone(); +} + _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI const tzdb& reload_tzdb(); _LIBCPP_NODISCARD_EXT _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI string remote_version(); diff --git a/libcxx/include/chrono b/libcxx/include/chrono index 4dd43137b718..8fdc30a3624d 100644 --- a/libcxx/include/chrono +++ b/libcxx/include/chrono @@ -689,6 +689,9 @@ struct tzdb { vector zones; vector links; vector leap_seconds; + + const time_zone* locate_zone(string_view tz_name) const; + const time_zone* current_zone() const; }; class tzdb_list { // C++20 @@ -714,6 +717,8 @@ public: // [time.zone.db.access], time zone database access const tzdb& get_tzdb(); // C++20 tzdb_list& get_tzdb_list(); // C++20 +const time_zone* locate_zone(string_view tz_name); // C++20 +const time_zone* current_zone() // C++20 // [time.zone.db.remote], remote time zone database support const tzdb& reload_tzdb(); // C++20 diff --git a/libcxx/modules/std/chrono.inc b/libcxx/modules/std/chrono.inc index 2c0bd3f98a67..e14228043d3b 100644 --- a/libcxx/modules/std/chrono.inc +++ b/libcxx/modules/std/chrono.inc @@ -199,10 +199,10 @@ export namespace std { using std::chrono::tzdb_list; // [time.zone.db.access], time zone database access - // using std::chrono::current_zone; + using std::chrono::current_zone; using std::chrono::get_tzdb; using std::chrono::get_tzdb_list; - // using std::chrono::locate_zone; + using std::chrono::locate_zone; // [time.zone.db.remote], remote time zone database support using std::chrono::reload_tzdb; diff --git a/libcxx/src/tzdb.cpp 
b/libcxx/src/tzdb.cpp index 7ba5ceb7ada3..2c82a4a4317a 100644 --- a/libcxx/src/tzdb.cpp +++ b/libcxx/src/tzdb.cpp @@ -675,6 +675,57 @@ void __init_tzdb(tzdb& __tzdb, __tz::__rules_storage_type& __rules) { std::ranges::sort(__tzdb.leap_seconds); } +#ifdef _WIN32 +[[nodiscard]] static const time_zone* __current_zone_windows(const tzdb& tzdb) { + // TODO TZDB Implement this on Windows. + std::__throw_runtime_error("unknown time zone"); +} +#else // ifdef _WIN32 +[[nodiscard]] static const time_zone* __current_zone_posix(const tzdb& tzdb) { + // On POSIX systems there are several ways to configure the time zone. + // In order of priority they are: + // - TZ environment variable + // https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap08.html#tag_08 + // The documentation is unclear whether or not it's allowed to + // change time zone information. For example the TZ string + // MST7MDT + // this is an entry in tzdata.zi. The value + // MST + // is also an entry. Is it allowed to use the following? + // MST-3 + // Even when this is valid there is no time_zone record in the + // database. Since the library would need to return a valid pointer, + // this means the library needs to allocate and leak a pointer. + // + // - The time zone name is the target of the symlink /etc/localtime + // relative to /usr/share/zoneinfo/ + + // The algorithm is like this: + // - If the environment variable TZ is set and points to a valid + // record use this value. + // - Else use the name based on the `/etc/localtime` symlink. 
+ + if (const char* __tz = getenv("TZ")) + if (const time_zone* __result = tzdb.__locate_zone(__tz)) + return __result; + + filesystem::path __path = "/etc/localtime"; + if (!std::filesystem::exists(__path)) + std::__throw_runtime_error("tzdb: the symlink '/etc/localtime' does not exist"); + + if (!std::filesystem::is_symlink(__path)) + std::__throw_runtime_error("tzdb: the path '/etc/localtime' is not a symlink"); + + filesystem::path __tz = filesystem::read_symlink(__path); + string __name = filesystem::relative(__tz, "/usr/share/zoneinfo/"); + + if (const time_zone* __result = tzdb.__locate_zone(__name)) + return __result; + + std::__throw_runtime_error(("tzdb: the time zone '" + __name + "' is not found in the database").c_str()); +} +#endif // ifdef _WIN32 + //===----------------------------------------------------------------------===// // Public API //===----------------------------------------------------------------------===// @@ -684,6 +735,14 @@ _LIBCPP_NODISCARD_EXT _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI tzdb_l return __result; } +[[nodiscard]] _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI const time_zone* tzdb::__current_zone() const { +#ifdef _WIN32 + return chrono::__current_zone_windows(*this); +#else + return chrono::__current_zone_posix(*this); +#endif +} + _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI const tzdb& reload_tzdb() { if (chrono::remote_version() == chrono::get_tzdb().version) return chrono::get_tzdb(); diff --git a/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp b/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp index c868832ea74a..9acb57fa05f7 100644 --- a/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp +++ b/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp @@ -38,8 +38,16 @@ void test() { std::chrono::get_tzdb_list(); std::chrono::get_tzdb(); + std::chrono::locate_zone("name"); + 
std::chrono::current_zone(); std::chrono::remote_version(); + { + const std::chrono::tzdb& t = list.front(); + t.locate_zone("name"); + t.current_zone(); + } + { tz.name(); operator==(tz, tz); diff --git a/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp b/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp index 4d26b46a89c9..8795a4eb3c6c 100644 --- a/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp +++ b/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp @@ -33,9 +33,17 @@ void test() { list.cbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} list.cend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + { + const std::chrono::tzdb& t = list.front(); + t.locate_zone("name"); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + t.current_zone(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + } + namespace crno = std::chrono; crno::get_tzdb_list(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} crno::get_tzdb(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + crno::locate_zone("n"); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + crno::current_zone(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} crno::remote_version(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} { diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp new file mode 100644 index 000000000000..971f7f04c49a --- /dev/null +++ 
b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp @@ -0,0 +1,84 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: availability-tzdb-missing + +// + +// struct tzdb + +// const time_zone* locate_zone(string_view tz_name) const; + +#include +#include +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" +#include "filesystem_test_helper.h" +#include "test_tzdb.h" + +scoped_test_env env; +[[maybe_unused]] const std::filesystem::path dir = env.create_dir("zoneinfo"); +const std::filesystem::path file = env.create_file("zoneinfo/tzdata.zi"); + +std::string_view std::chrono::__libcpp_tzdb_directory() { + static std::string result = dir.string(); + return result; +} + +void write(std::string_view input) { + static int version = 0; + + std::ofstream f{file}; + f << "# version " << version++ << '\n'; + f.write(input.data(), input.size()); +} + +static const std::chrono::tzdb& parse(std::string_view input) { + write(input); + return std::chrono::reload_tzdb(); +} + +int main(int, const char**) { + const std::chrono::tzdb& tzdb = parse( + R"( +Z zone 0 r f +L zone link +L link link_to_link +)"); + + { + const std::chrono::time_zone* tz = tzdb.locate_zone("zone"); + assert(tz); + assert(tz->name() == "zone"); + } + { + const std::chrono::time_zone* tz = tzdb.locate_zone("link"); + assert(tz); + assert(tz->name() == "zone"); + } + + TEST_VALIDATE_EXCEPTION( + std::runtime_error, + [&]([[maybe_unused]] const 
std::runtime_error& e) { + std::string_view what{"tzdb: requested time zone not found"}; + TEST_LIBCPP_REQUIRE( + e.what() == what, + TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception ", e.what(), '\n')); + }, + TEST_IGNORE_NODISCARD tzdb.locate_zone("link_to_link")); + + return 0; +} diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp new file mode 100644 index 000000000000..d85c8ba52622 --- /dev/null +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp @@ -0,0 +1,77 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: availability-tzdb-missing + +// + +// const time_zone* current_zone(); + +#include +#include +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" + +#ifdef _WIN32 +static void set_tz(std::string zone) { + // Note Windows does not have setenv, only putenv + // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/putenv-s-wputenv-s?view=msvc-170 + // Unlike POSIX it does not mention the string of putenv becomes part + // of the environment. 
+ + int status = _putenv_s("TZ", zone.c_str(), 1); + assert(status == 0); +} + +#else +static void set_tz(const std::string& zone) { + int status = setenv("TZ", zone.c_str(), 1); + assert(status == 0); +} +#endif + +static void test_zone(const std::string& zone) { + set_tz(zone); + const std::chrono::time_zone* tz = std::chrono::current_zone(); + assert(tz); + assert(tz->name() == zone); +} + +static void test_link(const std::string& link, std::string_view zone) { + set_tz(link); + const std::chrono::time_zone* tz = std::chrono::current_zone(); + assert(tz); + assert(tz->name() == zone); +} + +int main(int, const char**) { + const std::chrono::time_zone* tz = std::chrono::current_zone(); + // Returns a valid time zone, the value depends on the OS settings. + assert(tz); + // setting the environment to an invalid value returns the value of + // the OS setting. + set_tz("This is not a time zone"); + assert(tz == std::chrono::current_zone()); + + const std::chrono::tzdb& db = std::chrono::get_tzdb(); + for (const auto& zone : db.zones) + test_zone(std::string{zone.name()}); + + for (const auto& link : db.links) + test_link(std::string{link.name()}, link.target()); + + return 0; +} diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp new file mode 100644 index 000000000000..c3142a86bf9d --- /dev/null +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: availability-tzdb-missing + +// + +// const time_zone* locate_zone(string_view tz_name); + +#include +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" + +static void test_zone(std::string_view zone) { + const std::chrono::time_zone* tz = std::chrono::locate_zone(zone); + assert(tz); + assert(tz->name() == zone); +} + +static void test_link(std::string_view link, std::string_view zone) { + const std::chrono::time_zone* tz = std::chrono::locate_zone(link); + assert(tz); + assert(tz->name() == zone); +} + +static void test_exception([[maybe_unused]] std::string_view zone) { + TEST_VALIDATE_EXCEPTION( + std::runtime_error, + [&]([[maybe_unused]] const std::runtime_error& e) { + std::string_view what{"tzdb: requested time zone not found"}; + TEST_LIBCPP_REQUIRE( + e.what() == what, + TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception ", e.what(), '\n')); + }, + TEST_IGNORE_NODISCARD std::chrono::locate_zone(zone)); +} + +int main(int, const char**) { + const std::chrono::tzdb& db = std::chrono::get_tzdb(); + for (const auto& zone : db.zones) + test_zone(zone.name()); + + for (const auto& link : db.links) + test_link(link.name(), link.target()); + + test_exception("This is not a time zone"); + + return 0; +} diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp new file mode 100644 index 000000000000..7b4218cc8421 --- /dev/null +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp @@ -0,0 +1,79 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: availability-tzdb-missing + +// + +// struct tzdb + +// const time_zone* current_zone() const; + +#include +#include +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" + +#ifdef _WIN32 +static void set_tz(std::string zone) { + // Note Windows does not have setenv, only putenv + // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/putenv-s-wputenv-s?view=msvc-170 + // Unlike POSIX it does not mention the string of putenv becomes part + // of the environment. + + int status = _putenv_s("TZ", zone.c_str(), 1); + assert(status == 0); +} + +#else +static void set_tz(const std::string& zone) { + int status = setenv("TZ", zone.c_str(), 1); + assert(status == 0); +} +#endif + +static void test_zone(const std::string& zone) { + set_tz(zone); + const std::chrono::time_zone* tz = std::chrono::get_tzdb().current_zone(); + assert(tz); + assert(tz->name() == zone); +} + +static void test_link(const std::string& link, std::string_view zone) { + set_tz(link); + const std::chrono::time_zone* tz = std::chrono::get_tzdb().current_zone(); + assert(tz); + assert(tz->name() == zone); +} + +int main(int, const char**) { + const std::chrono::time_zone* tz = std::chrono::get_tzdb().current_zone(); + // Returns a valid time zone, the value depends on the OS settings. + assert(tz); + // setting the environment to an invalid value returns the value of + // the OS setting. 
+ set_tz("This is not a time zone"); + assert(tz == std::chrono::get_tzdb().current_zone()); + + const std::chrono::tzdb& db = std::chrono::get_tzdb(); + for (const auto& zone : db.zones) + test_zone(std::string{zone.name()}); + + for (const auto& link : db.links) + test_link(std::string{link.name()}, link.target()); + + return 0; +} diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp new file mode 100644 index 000000000000..12987f6c89d8 --- /dev/null +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: availability-tzdb-missing + +// + +// struct tzdb + +// const time_zone* locate_zone(string_view tz_name) const; + +#include +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" + +static void test_zone(std::string_view zone) { + const std::chrono::time_zone* tz = std::chrono::get_tzdb().locate_zone(zone); + assert(tz); + assert(tz->name() == zone); +} + +static void test_link(std::string_view link, std::string_view zone) { + const std::chrono::time_zone* tz = std::chrono::get_tzdb().locate_zone(link); + assert(tz); + assert(tz->name() == zone); +} + +static void test_exception([[maybe_unused]] std::string_view zone) { + TEST_VALIDATE_EXCEPTION( + std::runtime_error, + [&]([[maybe_unused]] const std::runtime_error& 
e) { + std::string_view what{"tzdb: requested time zone not found"}; + TEST_LIBCPP_REQUIRE( + e.what() == what, + TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception ", e.what(), '\n')); + }, + TEST_IGNORE_NODISCARD std::chrono::get_tzdb().locate_zone(zone)); +} + +int main(int, const char**) { + const std::chrono::tzdb& db = std::chrono::get_tzdb(); + for (const auto& zone : db.zones) + test_zone(zone.name()); + + for (const auto& link : db.links) + test_link(link.name(), link.target()); + + test_exception("This is not a time zone"); + + return 0; +} -- cgit v1.2.3 From 0f7266a97c0443234bf18508a408e6917f6a2d46 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 4 Apr 2024 17:04:58 +0000 Subject: [gn build] Manually port 6f2d8cc0 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + llvm/utils/gn/secondary/libcxx/src/BUILD.gn | 1 + 2 files changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 6c09b3611d87..78a6e6fda826 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -352,6 +352,7 @@ if (current_toolchain == default_toolchain) { "__chrono/formatter.h", "__chrono/hh_mm_ss.h", "__chrono/high_resolution_clock.h", + "__chrono/leap_second.h", "__chrono/literals.h", "__chrono/month.h", "__chrono/month_weekday.h", diff --git a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn index 55309720725b..90f6f5d0f145 100644 --- a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn @@ -315,6 +315,7 @@ if (libcxx_enable_experimental) { sources = [ "experimental/keep.cpp" ] if (libcxx_enable_filesystem && libcxx_enable_time_zone_database) { sources += [ + "include/tzdb/leap_second_private.h", "include/tzdb/time_zone_link_private.h", "include/tzdb/time_zone_private.h", "include/tzdb/types_private.h", -- cgit v1.2.3 
From 6cf532891f9b49ccae3c7e3dfedb2a3662cdd569 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 4 Apr 2024 17:05:30 +0000 Subject: [gn build] Port 3365d6217901 --- .../utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn index 33fdecfd948e..59dc38c8c4d8 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn @@ -25,6 +25,7 @@ static_library("readability") { "DeleteNullPointerCheck.cpp", "DuplicateIncludeCheck.cpp", "ElseAfterReturnCheck.cpp", + "EnumInitialValueCheck.cpp", "FunctionCognitiveComplexityCheck.cpp", "FunctionSizeCheck.cpp", "IdentifierLengthCheck.cpp", -- cgit v1.2.3 From 13e75721117070d52cf3be7d88becccd5a6838b5 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 4 Apr 2024 17:05:31 +0000 Subject: [gn build] Port 8bb9443333e0 --- llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn index 15766d4a7e34..fba81184d992 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/GlobalISel/BUILD.gn @@ -17,6 +17,7 @@ static_library("GlobalISel") { "CallLowering.cpp", "Combiner.cpp", "CombinerHelper.cpp", + "CombinerHelperVectorOps.cpp", "GIMatchTableExecutor.cpp", "GISelChangeObserver.cpp", "GISelKnownBits.cpp", -- cgit v1.2.3 From 258dd64978c055110373447325b9b1dd4e5268d7 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 4 Apr 2024 17:05:32 +0000 Subject: [gn build] Port fd38366e4525 --- llvm/utils/gn/secondary/bolt/lib/Profile/BUILD.gn | 1 - 1 file changed, 1 deletion(-) diff 
--git a/llvm/utils/gn/secondary/bolt/lib/Profile/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/Profile/BUILD.gn index 5fead24afb45..dc85fb064121 100644 --- a/llvm/utils/gn/secondary/bolt/lib/Profile/BUILD.gn +++ b/llvm/utils/gn/secondary/bolt/lib/Profile/BUILD.gn @@ -12,7 +12,6 @@ static_library("Profile") { "DataAggregator.cpp", "DataReader.cpp", "Heatmap.cpp", - "ProfileReaderBase.cpp", "StaleProfileMatching.cpp", "YAMLProfileReader.cpp", "YAMLProfileWriter.cpp", -- cgit v1.2.3 From f2d22b5944b3c3f77dd236d89dd419d231243a56 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 4 Apr 2024 10:09:43 -0700 Subject: [memprof] Make RecordWriterTrait a non-template class (#87604) commit d89914f30bc7c180fe349a5aa0f03438ae6c20a4 Author: Kazu Hirata Date: Wed Apr 3 21:48:38 2024 -0700 changed RecordWriterTrait to a template class with IndexedVersion as a template parameter. This patch changes the class back to a non-template one while retaining the ability to serialize multiple versions. The reason I changed RecordWriterTrait to a template class was because, even if RecordWriterTrait had IndexedVersion as a member variable, RecordWriterTrait::EmitKeyDataLength, being a static function, would not have access to the variable. Since OnDiskChainedHashTableGenerator calls EmitKeyDataLength as: const std::pair &Len = InfoObj.EmitKeyDataLength(Out, I->Key, I->Data); we can make EmitKeyDataLength a member function, but we have one problem. InstrProfWriter::writeImpl calls: void insert(typename Info::key_type_ref Key, typename Info::data_type_ref Data) { Info InfoObj; insert(Key, Data, InfoObj); } which default-constructs RecordWriterTrait without a specific version number. 
This patch fixes the problem by adjusting InstrProfWriter::writeImpl to call the other form of insert instead: void insert(typename Info::key_type_ref Key, typename Info::data_type_ref Data, Info &InfoObj) To prevent an accidental invocation of the default constructor of RecordWriterTrait, this patch deletes the default constructor. --- llvm/include/llvm/ProfileData/MemProf.h | 10 +++++++--- llvm/lib/ProfileData/InstrProfWriter.cpp | 7 +++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 110e69770264..0431c182276e 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -502,7 +502,7 @@ private: }; // Trait for writing IndexedMemProfRecord data to the on-disk hash table. -template class RecordWriterTrait { +class RecordWriterTrait { public: using key_type = uint64_t; using key_type_ref = uint64_t; @@ -517,12 +517,16 @@ public: // we must use a default constructor with no params for the writer trait so we // have a public member which must be initialized by the user. MemProfSchema *Schema = nullptr; + // The MemProf version to use for the serialization. + IndexedVersion Version; - RecordWriterTrait() = default; + // We do not support the default constructor, which does not set Version. 
+ RecordWriterTrait() = delete; + RecordWriterTrait(IndexedVersion V) : Version(V) {} static hash_value_type ComputeHash(key_type_ref K) { return K; } - static std::pair + std::pair EmitKeyDataLength(raw_ostream &Out, key_type_ref K, data_type_ref V) { using namespace support; diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index a1bc180a53ca..96ab729f91e4 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -558,14 +558,13 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { } auto RecordWriter = - std::make_unique>(); + std::make_unique(memprof::Version1); RecordWriter->Schema = &Schema; - OnDiskChainedHashTableGenerator< - memprof::RecordWriterTrait> + OnDiskChainedHashTableGenerator RecordTableGenerator; for (auto &I : MemProfRecordData) { // Insert the key (func hash) and value (memprof record). - RecordTableGenerator.insert(I.first, I.second); + RecordTableGenerator.insert(I.first, I.second, *RecordWriter.get()); } // Release the memory of this MapVector as it is no longer needed. MemProfRecordData.clear(); -- cgit v1.2.3 From b9ec4ab6ac4f3ae8603da8bfbf10bc0ec104e0b7 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 4 Apr 2024 10:28:36 -0700 Subject: [CMake] Install LLVMgold.so for LLVM_INSTALL_TOOLCHAIN_ONLY=on (#87567) LLVMgold.so can be used with GNU ar, gold, ld, and nm to process LLVM bitcode files. Install it in LLVM_INSTALL_TOOLCHAIN_ONLY=on builds like we install libLTO.so. 
Suggested by @emelife Fix #84271 --- llvm/tools/gold/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/tools/gold/CMakeLists.txt b/llvm/tools/gold/CMakeLists.txt index 58b323805c7d..5c78529e38f4 100644 --- a/llvm/tools/gold/CMakeLists.txt +++ b/llvm/tools/gold/CMakeLists.txt @@ -12,7 +12,7 @@ if( LLVM_ENABLE_PIC AND LLVM_BINUTILS_INCDIR ) TargetParser ) - add_llvm_library(LLVMgold MODULE + add_llvm_library(LLVMgold MODULE INSTALL_WITH_TOOLCHAIN gold-plugin.cpp ) -- cgit v1.2.3 From 220cdf940e953002df1521bbd061d8e0b4ffed5c Mon Sep 17 00:00:00 2001 From: Fabian Mora Date: Thu, 4 Apr 2024 13:34:46 -0400 Subject: [mlir] Add `requiresReplacedValues` and `visitReplacedValues` to `PromotableOpInterface` (#86792) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `requiresReplacedValues` and `visitReplacedValues` methods to `PromotableOpInterface`. These methods allow `PromotableOpInterface` ops to transforms definitions mutated by a `store`. This change is necessary to correctly handle the promotion of `LLVM_DbgDeclareOp`. 
--------- Co-authored-by: Théo Degioanni <30992420+Moxinilian@users.noreply.github.com> --- .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td | 6 +++-- .../mlir/Interfaces/MemorySlotInterfaces.td | 30 ++++++++++++++++++++++ mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp | 19 +++++++++----- mlir/lib/Transforms/Mem2Reg.cpp | 16 +++++++++++- mlir/test/Dialect/LLVMIR/mem2reg-dbginfo.mlir | 21 +++++++++++++++ 5 files changed, 82 insertions(+), 10 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index 28526f1a1560..a52cca3c95de 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -562,8 +562,10 @@ class LLVM_DbgIntrOp traits = []> }]; } -def LLVM_DbgDeclareOp : LLVM_DbgIntrOp<"dbg.declare", "addr", - [DeclareOpInterfaceMethods]> { +def LLVM_DbgDeclareOp : LLVM_DbgIntrOp<"dbg.declare", "addr", [ + DeclareOpInterfaceMethods]> { let summary = "Describes how the address relates to a source language variable."; let arguments = (ins LLVM_AnyPointer:$addr, diff --git a/mlir/include/mlir/Interfaces/MemorySlotInterfaces.td b/mlir/include/mlir/Interfaces/MemorySlotInterfaces.td index e10e2d4e104c..9db89361c780 100644 --- a/mlir/include/mlir/Interfaces/MemorySlotInterfaces.td +++ b/mlir/include/mlir/Interfaces/MemorySlotInterfaces.td @@ -229,6 +229,36 @@ def PromotableOpInterface : OpInterface<"PromotableOpInterface"> { (ins "const ::llvm::SmallPtrSetImpl &":$blockingUses, "::mlir::RewriterBase &":$rewriter) >, + InterfaceMethod<[{ + This method allows the promoted operation to visit the SSA values used + in place of the memory slot once the promotion process of the memory + slot is complete. + + If this method returns true, the `visitReplacedValues` method on this + operation will be called after the main mutation stage finishes + (i.e., after all ops have been processed with `removeBlockingUses`). 
+ + Operations should only the replaced values if the intended + transformation applies to all the replaced values. Furthermore, replaced + values must not be deleted. + }], "bool", "requiresReplacedValues", (ins), [{}], + [{ return false; }] + >, + InterfaceMethod<[{ + Transforms the IR using the SSA values that replaced the memory slot. + + This method will only be called after all blocking uses have been + scheduled for removal and if `requiresReplacedValues` returned + true. + + The rewriter is located after the promotable operation on call. All IR + mutations must happen through the rewriter. During the transformation, + *no operation should be deleted*. + }], + "void", "visitReplacedValues", + (ins "::llvm::ArrayRef>":$mutatedDefs, + "::mlir::RewriterBase &":$rewriter), [{}], [{ return; }] + >, ]; } diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp index 06c1fdd2eb2d..563755297c57 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp @@ -168,13 +168,6 @@ bool LLVM::StoreOp::canUsesBeRemoved( DeletionKind LLVM::StoreOp::removeBlockingUses( const MemorySlot &slot, const SmallPtrSetImpl &blockingUses, RewriterBase &rewriter, Value reachingDefinition) { - // `canUsesBeRemoved` checked this blocking use must be the stored slot - // pointer. 
- for (Operation *user : slot.ptr.getUsers()) - if (auto declareOp = dyn_cast(user)) - rewriter.create(declareOp->getLoc(), getValue(), - declareOp.getVarInfo(), - declareOp.getLocationExpr()); return DeletionKind::Delete; } @@ -407,6 +400,18 @@ DeletionKind LLVM::DbgValueOp::removeBlockingUses( return DeletionKind::Keep; } +bool LLVM::DbgDeclareOp::requiresReplacedValues() { return true; } + +void LLVM::DbgDeclareOp::visitReplacedValues( + ArrayRef> definitions, + RewriterBase &rewriter) { + for (auto [op, value] : definitions) { + rewriter.setInsertionPointAfter(op); + rewriter.create(getLoc(), value, getVarInfo(), + getLocationExpr()); + } +} + //===----------------------------------------------------------------------===// // Interfaces for GEPOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Transforms/Mem2Reg.cpp b/mlir/lib/Transforms/Mem2Reg.cpp index 80e3b7901632..abe565ea862f 100644 --- a/mlir/lib/Transforms/Mem2Reg.cpp +++ b/mlir/lib/Transforms/Mem2Reg.cpp @@ -202,6 +202,7 @@ private: /// Contains the reaching definition at this operation. Reaching definitions /// are only computed for promotable memory operations with blocking uses. DenseMap reachingDefs; + DenseMap replacedValuesMap; DominanceInfo &dominance; MemorySlotPromotionInfo info; const Mem2RegStatistics &statistics; @@ -438,6 +439,7 @@ Value MemorySlotPromoter::computeReachingDefInBlock(Block *block, assert(stored && "a memory operation storing to a slot must provide a " "new definition of the slot"); reachingDef = stored; + replacedValuesMap[memOp] = stored; } } } @@ -552,6 +554,10 @@ void MemorySlotPromoter::removeBlockingUses() { dominanceSort(usersToRemoveUses, *slot.ptr.getParentBlock()->getParent()); llvm::SmallVector toErase; + // List of all replaced values in the slot. + llvm::SmallVector> replacedValuesList; + // Ops to visit with the `visitReplacedValues` method. 
+ llvm::SmallVector toVisit; for (Operation *toPromote : llvm::reverse(usersToRemoveUses)) { if (auto toPromoteMemOp = dyn_cast(toPromote)) { Value reachingDef = reachingDefs.lookup(toPromoteMemOp); @@ -565,7 +571,9 @@ void MemorySlotPromoter::removeBlockingUses() { slot, info.userToBlockingUses[toPromote], rewriter, reachingDef) == DeletionKind::Delete) toErase.push_back(toPromote); - + if (toPromoteMemOp.storesTo(slot)) + if (Value replacedValue = replacedValuesMap[toPromoteMemOp]) + replacedValuesList.push_back({toPromoteMemOp, replacedValue}); continue; } @@ -574,6 +582,12 @@ void MemorySlotPromoter::removeBlockingUses() { if (toPromoteBasic.removeBlockingUses(info.userToBlockingUses[toPromote], rewriter) == DeletionKind::Delete) toErase.push_back(toPromote); + if (toPromoteBasic.requiresReplacedValues()) + toVisit.push_back(toPromoteBasic); + } + for (PromotableOpInterface op : toVisit) { + rewriter.setInsertionPointAfter(op); + op.visitReplacedValues(replacedValuesList, rewriter); } for (Operation *toEraseOp : toErase) diff --git a/mlir/test/Dialect/LLVMIR/mem2reg-dbginfo.mlir b/mlir/test/Dialect/LLVMIR/mem2reg-dbginfo.mlir index f7ddb4a7abe5..b7cbd787f06e 100644 --- a/mlir/test/Dialect/LLVMIR/mem2reg-dbginfo.mlir +++ b/mlir/test/Dialect/LLVMIR/mem2reg-dbginfo.mlir @@ -29,6 +29,27 @@ llvm.func @basic_store_load(%arg0: i64) -> i64 { llvm.return %2 : i64 } +// CHECK-LABEL: llvm.func @multiple_store_load +llvm.func @multiple_store_load(%arg0: i64) -> i64 { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK-NOT: = llvm.alloca + %1 = llvm.alloca %0 x i64 {alignment = 8 : i64} : (i32) -> !llvm.ptr + // CHECK-NOT: llvm.intr.dbg.declare + llvm.intr.dbg.declare #di_local_variable = %1 : !llvm.ptr + // CHECK-NOT: llvm.store + llvm.store %arg0, %1 {alignment = 4 : i64} : i64, !llvm.ptr + // CHECK-NOT: llvm.intr.dbg.declare + llvm.intr.dbg.declare #di_local_variable = %1 : !llvm.ptr + // CHECK: llvm.intr.dbg.value #[[$VAR]] = %[[LOADED:.*]] : i64 + // CHECK: 
llvm.intr.dbg.value #[[$VAR]] = %[[LOADED]] : i64 + // CHECK-NOT: llvm.intr.dbg.value + // CHECK-NOT: llvm.intr.dbg.declare + // CHECK-NOT: llvm.store + %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i64 + // CHECK: llvm.return %[[LOADED]] : i64 + llvm.return %2 : i64 +} + // CHECK-LABEL: llvm.func @block_argument_value // CHECK-SAME: (%[[ARG0:.*]]: i64, {{.*}}) llvm.func @block_argument_value(%arg0: i64, %arg1: i1) -> i64 { -- cgit v1.2.3 From 02b49d14a50cbfad0196cdddba6771f0593fdc3b Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Thu, 28 Mar 2024 14:23:58 -0500 Subject: [ValueTracking] Add tests for computing known bits from `(icmp eq (and/or x,y), C)`; NFC --- llvm/test/Transforms/InstCombine/known-bits.ll | 110 +++++++++++++++++++++++-- 1 file changed, 105 insertions(+), 5 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll index 5305c78f6912..af3db828b133 100644 --- a/llvm/test/Transforms/InstCombine/known-bits.ll +++ b/llvm/test/Transforms/InstCombine/known-bits.ll @@ -124,7 +124,6 @@ exit: ret i8 %or2 } - define i8 @test_cond_and_bothways(i8 %x) { ; CHECK-LABEL: @test_cond_and_bothways( ; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], 91 @@ -181,8 +180,6 @@ exit: ret i8 %or2 } - - define i8 @test_cond_and_commuted(i8 %x, i1 %c1, i1 %c2) { ; CHECK-LABEL: @test_cond_and_commuted( ; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], 3 @@ -343,7 +340,7 @@ exit: ret i8 %or2 } -define i32 @test_icmp_trunc1(i32 %x){ +define i32 @test_icmp_trunc1(i32 %x) { ; CHECK-LABEL: @test_icmp_trunc1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[Y:%.*]] = trunc i32 [[X:%.*]] to i16 @@ -365,7 +362,7 @@ else: ret i32 0 } -define i32 @test_icmp_trunc_assume(i32 %x){ +define i32 @test_icmp_trunc_assume(i32 %x) { ; CHECK-LABEL: @test_icmp_trunc_assume( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[Y:%.*]] = trunc i32 [[X:%.*]] to i16 @@ -532,7 +529,110 @@ if.else: ret i1 %other } +define i8 @and_eq_bits_must_be_set(i8 %x, i8 %y) 
{ +; CHECK-LABEL: @and_eq_bits_must_be_set( +; CHECK-NEXT: [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 123 +; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: [[R:%.*]] = and i8 [[X]], 1 +; CHECK-NEXT: ret i8 [[R]] +; + %xy = and i8 %x, %y + %cmp = icmp eq i8 %xy, 123 + call void @llvm.assume(i1 %cmp) + %r = and i8 %x, 1 + ret i8 %r +} + +define i8 @and_eq_bits_must_be_set2(i8 %x, i8 %y) { +; CHECK-LABEL: @and_eq_bits_must_be_set2( +; CHECK-NEXT: [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 123 +; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: [[R:%.*]] = and i8 [[Y]], 11 +; CHECK-NEXT: ret i8 [[R]] +; + %xy = and i8 %x, %y + %cmp = icmp eq i8 %xy, 123 + call void @llvm.assume(i1 %cmp) + %r = and i8 %y, 11 + ret i8 %r +} + +define i8 @and_eq_bits_must_be_set2_partial_fail(i8 %x, i8 %y) { +; CHECK-LABEL: @and_eq_bits_must_be_set2_partial_fail( +; CHECK-NEXT: [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 123 +; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: [[R:%.*]] = and i8 [[Y]], 111 +; CHECK-NEXT: ret i8 [[R]] +; + %xy = and i8 %x, %y + %cmp = icmp eq i8 %xy, 123 + call void @llvm.assume(i1 %cmp) + %r = and i8 %y, 111 + ret i8 %r +} + +define i8 @or_eq_bits_must_be_unset(i8 %x, i8 %y) { +; CHECK-LABEL: @or_eq_bits_must_be_unset( +; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 124 +; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: [[R:%.*]] = and i8 [[X]], 3 +; CHECK-NEXT: ret i8 [[R]] +; + %xy = or i8 %x, %y + %cmp = icmp eq i8 %xy, 124 + call void @llvm.assume(i1 %cmp) + %r = and i8 %x, 3 + ret i8 %r +} + +define i8 @or_eq_bits_must_be_unset2(i8 %x, i8 %y) { +; CHECK-LABEL: @or_eq_bits_must_be_unset2( +; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 124 +; CHECK-NEXT: 
call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: [[R:%.*]] = and i8 [[Y]], 1 +; CHECK-NEXT: ret i8 [[R]] +; + %xy = or i8 %x, %y + %cmp = icmp eq i8 %xy, 124 + call void @llvm.assume(i1 %cmp) + %r = and i8 %y, 1 + ret i8 %r +} +define i8 @or_eq_bits_must_be_unset2_partial_fail(i8 %x, i8 %y) { +; CHECK-LABEL: @or_eq_bits_must_be_unset2_partial_fail( +; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 124 +; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: [[R:%.*]] = and i8 [[Y]], 7 +; CHECK-NEXT: ret i8 [[R]] +; + %xy = or i8 %x, %y + %cmp = icmp eq i8 %xy, 124 + call void @llvm.assume(i1 %cmp) + %r = and i8 %y, 7 + ret i8 %r +} + +define i8 @or_ne_bits_must_be_unset2_fail(i8 %x, i8 %y) { +; CHECK-LABEL: @or_ne_bits_must_be_unset2_fail( +; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 [[XY]], 124 +; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: [[R:%.*]] = and i8 [[X]], 3 +; CHECK-NEXT: ret i8 [[R]] +; + %xy = or i8 %x, %y + %cmp = icmp ne i8 %xy, 124 + call void @llvm.assume(i1 %cmp) + %r = and i8 %x, 3 + ret i8 %r +} declare void @use(i1) declare void @sink(i8) -- cgit v1.2.3 From 05cff99a29b2df17142907aa81ae488559756f01 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Thu, 28 Mar 2024 14:24:01 -0500 Subject: [ValueTracking] Infer known bits from `(icmp eq (and/or x,y), C)` In `(icmp eq (and x,y), C)` all 1s in `C` must also be set in both `x`/`y`. In `(icmp eq (or x,y), C)` all 0s in `C` must also be unset in both `x`/`y`.
Closes #87143 --- llvm/lib/Analysis/ValueTracking.cpp | 21 +++++++++++++++------ llvm/test/Transforms/InstCombine/known-bits.ll | 14 +++++--------- llvm/test/Transforms/InstCombine/zext-or-icmp.ll | 10 +++++----- 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index b5e8a1d22f26..33a69861cc3c 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -648,6 +648,7 @@ static void computeKnownBitsFromCmp(const Value *V, CmpInst::Predicate Pred, auto m_V = m_CombineOr(m_Specific(V), m_PtrToIntSameSize(Q.DL, m_Specific(V))); + Value *Y; const APInt *Mask, *C; uint64_t ShAmt; switch (Pred) { @@ -656,16 +657,18 @@ static void computeKnownBitsFromCmp(const Value *V, CmpInst::Predicate Pred, if (match(LHS, m_V) && match(RHS, m_APInt(C))) { Known = Known.unionWith(KnownBits::makeConstant(*C)); // assume(V & Mask = C) - } else if (match(LHS, m_And(m_V, m_APInt(Mask))) && + } else if (match(LHS, m_c_And(m_V, m_Value(Y))) && match(RHS, m_APInt(C))) { // For one bits in Mask, we can propagate bits from C to V. - Known.Zero |= ~*C & *Mask; - Known.One |= *C & *Mask; + Known.One |= *C; + if (match(Y, m_APInt(Mask))) + Known.Zero |= ~*C & *Mask; // assume(V | Mask = C) - } else if (match(LHS, m_Or(m_V, m_APInt(Mask))) && match(RHS, m_APInt(C))) { + } else if (match(LHS, m_c_Or(m_V, m_Value(Y))) && match(RHS, m_APInt(C))) { // For zero bits in Mask, we can propagate bits from C to V. - Known.Zero |= ~*C & ~*Mask; - Known.One |= *C & ~*Mask; + Known.Zero |= ~*C; + if (match(Y, m_APInt(Mask))) + Known.One |= *C & ~*Mask; // assume(V ^ Mask = C) } else if (match(LHS, m_Xor(m_V, m_APInt(Mask))) && match(RHS, m_APInt(C))) { @@ -9276,11 +9279,17 @@ void llvm::findValuesAffectedByCondition( if (ICmpInst::isEquality(Pred)) { if (match(B, m_ConstantInt())) { + Value *Y; // (X & C) or (X | C) or (X ^ C). // (X << C) or (X >>_s C) or (X >>_u C). 
if (match(A, m_BitwiseLogic(m_Value(X), m_ConstantInt())) || match(A, m_Shift(m_Value(X), m_ConstantInt()))) AddAffected(X); + else if (match(A, m_And(m_Value(X), m_Value(Y))) || + match(A, m_Or(m_Value(X), m_Value(Y)))) { + AddAffected(X); + AddAffected(Y); + } } } else { // Handle (A + C1) u< C2, which is the canonical form of diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll index af3db828b133..769f7661fc8d 100644 --- a/llvm/test/Transforms/InstCombine/known-bits.ll +++ b/llvm/test/Transforms/InstCombine/known-bits.ll @@ -534,8 +534,7 @@ define i8 @and_eq_bits_must_be_set(i8 %x, i8 %y) { ; CHECK-NEXT: [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 123 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) -; CHECK-NEXT: [[R:%.*]] = and i8 [[X]], 1 -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: ret i8 1 ; %xy = and i8 %x, %y %cmp = icmp eq i8 %xy, 123 @@ -549,8 +548,7 @@ define i8 @and_eq_bits_must_be_set2(i8 %x, i8 %y) { ; CHECK-NEXT: [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 123 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) -; CHECK-NEXT: [[R:%.*]] = and i8 [[Y]], 11 -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: ret i8 11 ; %xy = and i8 %x, %y %cmp = icmp eq i8 %xy, 123 @@ -579,8 +577,7 @@ define i8 @or_eq_bits_must_be_unset(i8 %x, i8 %y) { ; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 124 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) -; CHECK-NEXT: [[R:%.*]] = and i8 [[X]], 3 -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: ret i8 0 ; %xy = or i8 %x, %y %cmp = icmp eq i8 %xy, 124 @@ -594,8 +591,7 @@ define i8 @or_eq_bits_must_be_unset2(i8 %x, i8 %y) { ; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 124 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) -; CHECK-NEXT: [[R:%.*]] = and i8 [[Y]], 1 -; CHECK-NEXT: ret i8 [[R]] +; 
CHECK-NEXT: ret i8 0 ; %xy = or i8 %x, %y %cmp = icmp eq i8 %xy, 124 @@ -609,7 +605,7 @@ define i8 @or_eq_bits_must_be_unset2_partial_fail(i8 %x, i8 %y) { ; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 124 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) -; CHECK-NEXT: [[R:%.*]] = and i8 [[Y]], 7 +; CHECK-NEXT: [[R:%.*]] = and i8 [[Y]], 4 ; CHECK-NEXT: ret i8 [[R]] ; %xy = or i8 %x, %y diff --git a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll index 661c36038a67..a4b74aa8cc7d 100644 --- a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll +++ b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S | FileCheck %s +; RUN: opt < %s -passes='instcombine' -S | FileCheck %s define i8 @zext_or_icmp_icmp(i8 %a, i8 %b) { ; CHECK-LABEL: @zext_or_icmp_icmp( @@ -180,11 +180,11 @@ define i8 @PR49475_infloop(i32 %t0, i16 %insert, i64 %e, i8 %i162) { ; CHECK-NEXT: [[SEXT:%.*]] = shl i64 [[SUB17]], 32 ; CHECK-NEXT: [[CONV18:%.*]] = ashr exact i64 [[SEXT]], 32 ; CHECK-NEXT: [[CMP:%.*]] = icmp sge i64 [[XOR]], [[CONV18]] -; CHECK-NEXT: [[CONV19:%.*]] = zext i1 [[CMP]] to i16 -; CHECK-NEXT: [[OR21:%.*]] = or i16 [[CONV19]], [[INSERT]] -; CHECK-NEXT: [[TOBOOL23_NOT:%.*]] = icmp eq i16 [[OR21]], 0 +; CHECK-NEXT: [[TRUNC44:%.*]] = zext i1 [[CMP]] to i8 +; CHECK-NEXT: [[INC:%.*]] = add i8 [[TRUNC44]], [[I162]] +; CHECK-NEXT: [[TOBOOL23_NOT:%.*]] = xor i1 [[CMP]], true ; CHECK-NEXT: call void @llvm.assume(i1 [[TOBOOL23_NOT]]) -; CHECK-NEXT: ret i8 [[I162]] +; CHECK-NEXT: ret i8 [[INC]] ; %b = icmp eq i32 %t0, 0 %b2 = icmp eq i16 %insert, 0 -- cgit v1.2.3 From 74447cf46f97f24f52ac6675d642e6cc771447bb Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Wed, 20 Mar 2024 23:16:28 -0500 Subject: [ValueTracking] Add tests for deducing more conditions in 
`isTruePredicate`; NFC --- llvm/test/Transforms/InstCombine/implies.ll | 440 +++++++++++++++++++++++++++ llvm/test/Transforms/InstSimplify/implies.ll | 26 ++ 2 files changed, 466 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/implies.ll diff --git a/llvm/test/Transforms/InstCombine/implies.ll b/llvm/test/Transforms/InstCombine/implies.ll new file mode 100644 index 000000000000..6741d59f4fcc --- /dev/null +++ b/llvm/test/Transforms/InstCombine/implies.ll @@ -0,0 +1,440 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +define i1 @or_implies_sle(i8 %x, i8 %y, i1 %other) { +; CHECK-LABEL: @or_implies_sle( +; CHECK-NEXT: [[OR:%.*]] = or i8 [[X:%.*]], 23 +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[OR]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] +; CHECK: T: +; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[R]] +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %or = or i8 %x, 23 + %cond = icmp sle i8 %or, %y + br i1 %cond, label %T, label %F +T: + %r = icmp sle i8 %x, %y + ret i1 %r +F: + ret i1 %other +} + +define i1 @or_implies_sle_fail(i8 %x, i8 %y, i1 %other) { +; CHECK-LABEL: @or_implies_sle_fail( +; CHECK-NEXT: [[OR:%.*]] = or i8 [[X:%.*]], -34 +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[OR]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] +; CHECK: T: +; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[R]] +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %or = or i8 %x, -34 + %cond = icmp sle i8 %or, %y + br i1 %cond, label %T, label %F +T: + %r = icmp sle i8 %x, %y + ret i1 %r +F: + ret i1 %other +} + +define i1 @or_distjoint_implies_ule(i8 %x, i8 %y, i1 %other) { +; CHECK-LABEL: @or_distjoint_implies_ule( +; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24 +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X2]], [[Y:%.*]] +; 
CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] +; CHECK: T: +; CHECK-NEXT: [[X1:%.*]] = or disjoint i8 [[X]], 23 +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X1]], [[Y]] +; CHECK-NEXT: ret i1 [[R]] +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %x1 = or disjoint i8 %x, 23 + %x2 = or disjoint i8 %x, 24 + + %cond = icmp ule i8 %x2, %y + br i1 %cond, label %T, label %F +T: + %r = icmp ule i8 %x1, %y + ret i1 %r +F: + ret i1 %other +} + +define i1 @or_distjoint_implies_ule_fail(i8 %x, i8 %y, i1 %other) { +; CHECK-LABEL: @or_distjoint_implies_ule_fail( +; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24 +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X2]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] +; CHECK: T: +; CHECK-NEXT: [[X1:%.*]] = or disjoint i8 [[X]], 28 +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X1]], [[Y]] +; CHECK-NEXT: ret i1 [[R]] +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %x1 = or disjoint i8 %x, 28 + %x2 = or disjoint i8 %x, 24 + + %cond = icmp ule i8 %x2, %y + br i1 %cond, label %T, label %F +T: + %r = icmp ule i8 %x1, %y + ret i1 %r +F: + ret i1 %other +} + +define i1 @or_prove_distjoin_implies_ule(i8 %xx, i8 %y, i1 %other) { +; CHECK-LABEL: @or_prove_distjoin_implies_ule( +; CHECK-NEXT: [[X:%.*]] = and i8 [[XX:%.*]], -16 +; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X]], 10 +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X2]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] +; CHECK: T: +; CHECK-NEXT: ret i1 true +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %x = and i8 %xx, -16 + %x1 = or i8 %x, 7 + %x2 = or i8 %x, 10 + + %cond = icmp ule i8 %x2, %y + br i1 %cond, label %T, label %F +T: + %r = icmp ule i8 %x1, %y + ret i1 %r +F: + ret i1 %other +} + +define i1 @src_or_distjoint_implies_sle(i8 %x, i8 %y, i1 %other) { +; CHECK-LABEL: @src_or_distjoint_implies_sle( +; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24 +; CHECK-NEXT: 
[[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] +; CHECK: T: +; CHECK-NEXT: [[X1:%.*]] = or disjoint i8 [[X]], 23 +; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X1]], [[Y]] +; CHECK-NEXT: ret i1 [[R]] +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %x1 = or disjoint i8 %x, 23 + %x2 = or disjoint i8 %x, 24 + + %cond = icmp sle i8 %x2, %y + br i1 %cond, label %T, label %F +T: + %r = icmp sle i8 %x1, %y + ret i1 %r +F: + ret i1 %other +} + +define i1 @src_or_distjoint_implies_sle_fail(i8 %x, i8 %y, i1 %other) { +; CHECK-LABEL: @src_or_distjoint_implies_sle_fail( +; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24 +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp slt i8 [[X2]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] +; CHECK: T: +; CHECK-NEXT: [[X1:%.*]] = or disjoint i8 [[X]], 23 +; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X1]], [[Y]] +; CHECK-NEXT: ret i1 [[R]] +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %x1 = or disjoint i8 %x, 23 + %x2 = or disjoint i8 %x, 24 + + %cond = icmp sle i8 %y, %x2 + br i1 %cond, label %T, label %F +T: + %r = icmp sle i8 %x1, %y + ret i1 %r +F: + ret i1 %other +} + +define i1 @src_addnsw_implies_sle(i8 %x, i8 %y, i1 %other) { +; CHECK-LABEL: @src_addnsw_implies_sle( +; CHECK-NEXT: [[X2:%.*]] = add nsw i8 [[X:%.*]], 24 +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] +; CHECK: T: +; CHECK-NEXT: [[X1:%.*]] = add nsw i8 [[X]], 23 +; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X1]], [[Y]] +; CHECK-NEXT: ret i1 [[R]] +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %x1 = add nsw i8 %x, 23 + %x2 = add nsw i8 %x, 24 + + %cond = icmp sle i8 %x2, %y + br i1 %cond, label %T, label %F +T: + %r = icmp sle i8 %x1, %y + ret i1 %r +F: + ret i1 %other +} + +define i1 @src_addnsw_implies_sle_fail(i8 %x, i8 %y, i1 %other) { +; CHECK-LABEL: @src_addnsw_implies_sle_fail( 
+; CHECK-NEXT: [[X2:%.*]] = add nsw i8 [[X:%.*]], 23 +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] +; CHECK: T: +; CHECK-NEXT: [[X1:%.*]] = add nsw i8 [[X]], 24 +; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X1]], [[Y]] +; CHECK-NEXT: ret i1 [[R]] +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %x1 = add nsw i8 %x, 24 + %x2 = add nsw i8 %x, 23 + + %cond = icmp sle i8 %x2, %y + br i1 %cond, label %T, label %F +T: + %r = icmp sle i8 %x1, %y + ret i1 %r +F: + ret i1 %other +} + +define i1 @src_and_implies_ult(i8 %x, i8 %y, i8 %z, i1 %other) { +; CHECK-LABEL: @src_and_implies_ult( +; CHECK-NEXT: [[COND:%.*]] = icmp ult i8 [[X:%.*]], [[Z:%.*]] +; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]] +; CHECK: T: +; CHECK-NEXT: [[AND:%.*]] = and i8 [[Z]], [[X]] +; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[AND]], [[Z]] +; CHECK-NEXT: ret i1 [[R]] +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %cond = icmp ult i8 %x, %z + br i1 %cond, label %T, label %F +T: + %and = and i8 %z, %x + %r = icmp ult i8 %and, %z + ret i1 %r +F: + ret i1 %other +} + +define i1 @src_and_implies_ult_fail(i8 %x, i8 %y, i8 %z, i1 %other) { +; CHECK-LABEL: @src_and_implies_ult_fail( +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X:%.*]], [[Z:%.*]] +; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] +; CHECK: T: +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[Z]] +; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[AND]], [[Z]] +; CHECK-NEXT: ret i1 [[R]] +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %cond = icmp ule i8 %x, %z + br i1 %cond, label %T, label %F +T: + %and = and i8 %x, %z + %r = icmp ult i8 %and, %z + ret i1 %r +F: + ret i1 %other +} + +define i1 @src_and_implies_slt_fail(i8 %x, i8 %y, i8 %z, i1 %other) { +; CHECK-LABEL: @src_and_implies_slt_fail( +; CHECK-NEXT: [[COND:%.*]] = icmp slt i8 [[X:%.*]], [[Z:%.*]] +; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]] +; CHECK: T: 
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[AND]], [[Z]] +; CHECK-NEXT: ret i1 [[R]] +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %cond = icmp slt i8 %x, %z + br i1 %cond, label %T, label %F +T: + %and = and i8 %x, %y + %r = icmp slt i8 %and, %z + ret i1 %r +F: + ret i1 %other +} + +define i1 @src_or_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) { +; CHECK-LABEL: @src_or_implies_ule( +; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[OR]], [[Z:%.*]] +; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] +; CHECK: T: +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[Z]] +; CHECK-NEXT: ret i1 [[R]] +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %or = or i8 %y, %x + %cond = icmp uge i8 %z, %or + br i1 %cond, label %T, label %F +T: + %r = icmp ule i8 %x, %z + ret i1 %r +F: + ret i1 %other +} + +define i1 @src_or_implies_false_ugt_todo(i8 %x, i8 %y, i8 %z, i1 %other) { +; CHECK-LABEL: @src_or_implies_false_ugt_todo( +; CHECK-NEXT: [[OR:%.*]] = or i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[COND:%.*]] = icmp ugt i8 [[OR]], [[Z:%.*]] +; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]] +; CHECK: T: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; CHECK: F: +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[Z]] +; CHECK-NEXT: ret i1 [[R]] +; + %or = or i8 %x, %y + %cond = icmp ugt i8 %or, %z + br i1 %cond, label %T, label %F +T: + ret i1 %other +F: + %r = icmp ugt i8 %x, %z + ret i1 %r + +} + +define i1 @src_udiv_implies_ult(i8 %x, i8 %z, i1 %other) { +; CHECK-LABEL: @src_udiv_implies_ult( +; CHECK-NEXT: [[COND:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]] +; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]] +; CHECK: T: +; CHECK-NEXT: [[AND:%.*]] = udiv i8 [[X]], 3 +; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[AND]], [[Z]] +; CHECK-NEXT: ret i1 [[R]] +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %cond = icmp ugt i8 %z, %x + br i1 %cond, label 
%T, label %F +T: + %and = udiv i8 %x, 3 + %r = icmp ult i8 %and, %z + ret i1 %r +F: + ret i1 %other +} + +define i1 @src_udiv_implies_ult2(i8 %x, i8 %z, i1 %other) { +; CHECK-LABEL: @src_udiv_implies_ult2( +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]] +; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] +; CHECK: T: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; CHECK: F: +; CHECK-NEXT: [[AND:%.*]] = udiv i8 [[X]], 3 +; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[AND]], [[Z]] +; CHECK-NEXT: ret i1 [[R]] +; + %cond = icmp ule i8 %z, %x + br i1 %cond, label %T, label %F +T: + ret i1 %other +F: + %and = udiv i8 %x, 3 + %r = icmp ult i8 %and, %z + ret i1 %r +} + +define i1 @src_smin_implies_sle(i8 %x, i8 %y, i8 %z, i1 %other) { +; CHECK-LABEL: @src_smin_implies_sle( +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X:%.*]], [[Z:%.*]] +; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] +; CHECK: T: +; CHECK-NEXT: ret i1 true +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %cond = icmp sle i8 %x, %z + br i1 %cond, label %T, label %F +T: + %um = call i8 @llvm.smin.i8(i8 %x, i8 %y) + %r = icmp sle i8 %um, %z + ret i1 %r +F: + ret i1 %other +} + +define i1 @src_umin_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) { +; CHECK-LABEL: @src_umin_implies_ule( +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X:%.*]], [[Z:%.*]] +; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] +; CHECK: T: +; CHECK-NEXT: ret i1 true +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %cond = icmp ule i8 %x, %z + br i1 %cond, label %T, label %F +T: + %um = call i8 @llvm.umin.i8(i8 %x, i8 %y) + %r = icmp ule i8 %um, %z + ret i1 %r +F: + ret i1 %other +} + +define i1 @src_umax_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) { +; CHECK-LABEL: @src_umax_implies_ule( +; CHECK-NEXT: [[UM:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[UM]], [[Z:%.*]] +; CHECK-NEXT: br i1 [[COND_NOT]], label 
[[F:%.*]], label [[T:%.*]] +; CHECK: T: +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[Z]] +; CHECK-NEXT: ret i1 [[R]] +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %um = call i8 @llvm.umax.i8(i8 %x, i8 %y) + %cond = icmp ule i8 %um, %z + br i1 %cond, label %T, label %F +T: + %r = icmp ule i8 %x, %z + ret i1 %r +F: + ret i1 %other +} + +define i1 @src_smax_implies_sle(i8 %x, i8 %y, i8 %z, i1 %other) { +; CHECK-LABEL: @src_smax_implies_sle( +; CHECK-NEXT: [[UM:%.*]] = call i8 @llvm.smax.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[UM]], [[Z:%.*]] +; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] +; CHECK: T: +; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X]], [[Z]] +; CHECK-NEXT: ret i1 [[R]] +; CHECK: F: +; CHECK-NEXT: ret i1 [[OTHER:%.*]] +; + %um = call i8 @llvm.smax.i8(i8 %x, i8 %y) + %cond = icmp sle i8 %um, %z + br i1 %cond, label %T, label %F +T: + %r = icmp sle i8 %x, %z + ret i1 %r +F: + ret i1 %other +} diff --git a/llvm/test/Transforms/InstSimplify/implies.ll b/llvm/test/Transforms/InstSimplify/implies.ll index b70dc90da655..8a011908dd38 100644 --- a/llvm/test/Transforms/InstSimplify/implies.ll +++ b/llvm/test/Transforms/InstSimplify/implies.ll @@ -166,6 +166,19 @@ define i1 @test10(i32 %length.i, i32 %x.full) { ret i1 %res } +define i1 @test10_with_disjoint(i32 %length.i, i32 %x.full) { +; CHECK-LABEL: @test10_with_disjoint( +; CHECK-NEXT: ret i1 true +; + %x = and i32 %x.full, 4294901760 ;; 4294901760 == 0xffff0000 + %large = or disjoint i32 %x, 100 + %small = or disjoint i32 %x, 90 + %known = icmp ult i32 %large, %length.i + %to.prove = icmp ult i32 %small, %length.i + %res = icmp ule i1 %known, %to.prove + ret i1 %res +} + define i1 @test11(i32 %length.i, i32 %x) { ; CHECK-LABEL: @test11( ; CHECK-NEXT: [[LARGE:%.*]] = or i32 [[X:%.*]], 100 @@ -227,6 +240,19 @@ define i1 @test14(i32 %length.i, i32 %x.full) { ret i1 %res } +define i1 @test14_with_disjoint(i32 %length.i, i32 %x.full) { +; 
CHECK-LABEL: @test14_with_disjoint( +; CHECK-NEXT: ret i1 true +; + %x = and i32 %x.full, 4294905615 ;; 4294905615 == 0xffff0f0f + %large = or disjoint i32 %x, 8224 ;; == 0x2020 + %small = or disjoint i32 %x, 4112 ;; == 0x1010 + %known = icmp ult i32 %large, %length.i + %to.prove = icmp ult i32 %small, %length.i + %res = icmp ule i1 %known, %to.prove + ret i1 %res +} + define i1 @test15(i32 %length.i, i32 %x) { ; CHECK-LABEL: @test15( ; CHECK-NEXT: [[LARGE:%.*]] = add nuw i32 [[X:%.*]], 100 -- cgit v1.2.3 From 678f32ab66508aea2068a5e4e07d53b71ce5cf31 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Wed, 20 Mar 2024 22:08:22 -0500 Subject: [ValueTracking] Add more conditions in to `isTruePredicate` There is one notable "regression". This patch replaces the bespoke `or disjoint` logic we a direct match. This means we fail some simplification during `instsimplify`. All the cases we fail in `instsimplify` we do handle in `instcombine` as we add `disjoint` flags. Other than that, just some basic cases. See proofs: https://alive2.llvm.org/ce/z/_-g7C8 Closes #86083 --- llvm/lib/Analysis/ValueTracking.cpp | 89 +++++++++++++++++----------- llvm/test/Transforms/InstCombine/implies.ll | 36 ++++------- llvm/test/Transforms/InstSimplify/implies.ll | 16 ++++- 3 files changed, 77 insertions(+), 64 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 33a69861cc3c..5ad4da43bca7 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -8393,8 +8393,7 @@ bool llvm::matchSimpleRecurrence(const BinaryOperator *I, PHINode *&P, /// Return true if "icmp Pred LHS RHS" is always true. 
static bool isTruePredicate(CmpInst::Predicate Pred, const Value *LHS, - const Value *RHS, const DataLayout &DL, - unsigned Depth) { + const Value *RHS) { if (ICmpInst::isTrueWhenEqual(Pred) && LHS == RHS) return true; @@ -8406,8 +8405,26 @@ static bool isTruePredicate(CmpInst::Predicate Pred, const Value *LHS, const APInt *C; // LHS s<= LHS +_{nsw} C if C >= 0 - if (match(RHS, m_NSWAdd(m_Specific(LHS), m_APInt(C)))) + // LHS s<= LHS | C if C >= 0 + if (match(RHS, m_NSWAdd(m_Specific(LHS), m_APInt(C))) || + match(RHS, m_Or(m_Specific(LHS), m_APInt(C)))) return !C->isNegative(); + + // LHS s<= smax(LHS, V) for any V + if (match(RHS, m_c_SMax(m_Specific(LHS), m_Value()))) + return true; + + // smin(RHS, V) s<= RHS for any V + if (match(LHS, m_c_SMin(m_Specific(RHS), m_Value()))) + return true; + + // Match A to (X +_{nsw} CA) and B to (X +_{nsw} CB) + const Value *X; + const APInt *CLHS, *CRHS; + if (match(LHS, m_NSWAddLike(m_Value(X), m_APInt(CLHS))) && + match(RHS, m_NSWAddLike(m_Specific(X), m_APInt(CRHS)))) + return CLHS->sle(*CRHS); + return false; } @@ -8417,34 +8434,36 @@ static bool isTruePredicate(CmpInst::Predicate Pred, const Value *LHS, cast(RHS)->hasNoUnsignedWrap()) return true; + // LHS u<= LHS | V for any V + if (match(RHS, m_c_Or(m_Specific(LHS), m_Value()))) + return true; + + // LHS u<= umax(LHS, V) for any V + if (match(RHS, m_c_UMax(m_Specific(LHS), m_Value()))) + return true; + // RHS >> V u<= RHS for any V if (match(LHS, m_LShr(m_Specific(RHS), m_Value()))) return true; - // Match A to (X +_{nuw} CA) and B to (X +_{nuw} CB) - auto MatchNUWAddsToSameValue = [&](const Value *A, const Value *B, - const Value *&X, - const APInt *&CA, const APInt *&CB) { - if (match(A, m_NUWAdd(m_Value(X), m_APInt(CA))) && - match(B, m_NUWAdd(m_Specific(X), m_APInt(CB)))) - return true; + // RHS u/ C_ugt_1 u<= RHS + const APInt *C; + if (match(LHS, m_UDiv(m_Specific(RHS), m_APInt(C))) && C->ugt(1)) + return true; - // If X & C == 0 then (X | C) == X +_{nuw} C - if 
(match(A, m_Or(m_Value(X), m_APInt(CA))) && - match(B, m_Or(m_Specific(X), m_APInt(CB)))) { - KnownBits Known(CA->getBitWidth()); - computeKnownBits(X, Known, DL, Depth + 1, /*AC*/ nullptr, - /*CxtI*/ nullptr, /*DT*/ nullptr); - if (CA->isSubsetOf(Known.Zero) && CB->isSubsetOf(Known.Zero)) - return true; - } + // RHS & V u<= RHS for any V + if (match(LHS, m_c_And(m_Specific(RHS), m_Value()))) + return true; - return false; - }; + // umin(RHS, V) u<= RHS for any V + if (match(LHS, m_c_UMin(m_Specific(RHS), m_Value()))) + return true; + // Match A to (X +_{nuw} CA) and B to (X +_{nuw} CB) const Value *X; const APInt *CLHS, *CRHS; - if (MatchNUWAddsToSameValue(LHS, RHS, X, CLHS, CRHS)) + if (match(LHS, m_NUWAddLike(m_Value(X), m_APInt(CLHS))) && + match(RHS, m_NUWAddLike(m_Specific(X), m_APInt(CRHS)))) return CLHS->ule(*CRHS); return false; @@ -8456,37 +8475,36 @@ static bool isTruePredicate(CmpInst::Predicate Pred, const Value *LHS, /// ALHS ARHS" is true. Otherwise, return std::nullopt. static std::optional isImpliedCondOperands(CmpInst::Predicate Pred, const Value *ALHS, - const Value *ARHS, const Value *BLHS, const Value *BRHS, - const DataLayout &DL, unsigned Depth) { + const Value *ARHS, const Value *BLHS, const Value *BRHS) { switch (Pred) { default: return std::nullopt; case CmpInst::ICMP_SLT: case CmpInst::ICMP_SLE: - if (isTruePredicate(CmpInst::ICMP_SLE, BLHS, ALHS, DL, Depth) && - isTruePredicate(CmpInst::ICMP_SLE, ARHS, BRHS, DL, Depth)) + if (isTruePredicate(CmpInst::ICMP_SLE, BLHS, ALHS) && + isTruePredicate(CmpInst::ICMP_SLE, ARHS, BRHS)) return true; return std::nullopt; case CmpInst::ICMP_SGT: case CmpInst::ICMP_SGE: - if (isTruePredicate(CmpInst::ICMP_SLE, ALHS, BLHS, DL, Depth) && - isTruePredicate(CmpInst::ICMP_SLE, BRHS, ARHS, DL, Depth)) + if (isTruePredicate(CmpInst::ICMP_SLE, ALHS, BLHS) && + isTruePredicate(CmpInst::ICMP_SLE, BRHS, ARHS)) return true; return std::nullopt; case CmpInst::ICMP_ULT: case CmpInst::ICMP_ULE: - if 
(isTruePredicate(CmpInst::ICMP_ULE, BLHS, ALHS, DL, Depth) && - isTruePredicate(CmpInst::ICMP_ULE, ARHS, BRHS, DL, Depth)) + if (isTruePredicate(CmpInst::ICMP_ULE, BLHS, ALHS) && + isTruePredicate(CmpInst::ICMP_ULE, ARHS, BRHS)) return true; return std::nullopt; case CmpInst::ICMP_UGT: case CmpInst::ICMP_UGE: - if (isTruePredicate(CmpInst::ICMP_ULE, ALHS, BLHS, DL, Depth) && - isTruePredicate(CmpInst::ICMP_ULE, BRHS, ARHS, DL, Depth)) + if (isTruePredicate(CmpInst::ICMP_ULE, ALHS, BLHS) && + isTruePredicate(CmpInst::ICMP_ULE, BRHS, ARHS)) return true; return std::nullopt; } @@ -8530,7 +8548,7 @@ static std::optional isImpliedCondICmps(const ICmpInst *LHS, CmpInst::Predicate RPred, const Value *R0, const Value *R1, const DataLayout &DL, - bool LHSIsTrue, unsigned Depth) { + bool LHSIsTrue) { Value *L0 = LHS->getOperand(0); Value *L1 = LHS->getOperand(1); @@ -8577,7 +8595,7 @@ static std::optional isImpliedCondICmps(const ICmpInst *LHS, return LPred == RPred; if (LPred == RPred) - return isImpliedCondOperands(LPred, L0, L1, R0, R1, DL, Depth); + return isImpliedCondOperands(LPred, L0, L1, R0, R1); return std::nullopt; } @@ -8639,8 +8657,7 @@ llvm::isImpliedCondition(const Value *LHS, CmpInst::Predicate RHSPred, // Both LHS and RHS are icmps. const ICmpInst *LHSCmp = dyn_cast(LHS); if (LHSCmp) - return isImpliedCondICmps(LHSCmp, RHSPred, RHSOp0, RHSOp1, DL, LHSIsTrue, - Depth); + return isImpliedCondICmps(LHSCmp, RHSPred, RHSOp0, RHSOp1, DL, LHSIsTrue); /// The LHS should be an 'or', 'and', or a 'select' instruction. We expect /// the RHS to be an icmp. 
diff --git a/llvm/test/Transforms/InstCombine/implies.ll b/llvm/test/Transforms/InstCombine/implies.ll index 6741d59f4fcc..c02d84d3f837 100644 --- a/llvm/test/Transforms/InstCombine/implies.ll +++ b/llvm/test/Transforms/InstCombine/implies.ll @@ -7,8 +7,7 @@ define i1 @or_implies_sle(i8 %x, i8 %y, i1 %other) { ; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[OR]], [[Y:%.*]] ; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] ; CHECK: T: -; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X]], [[Y]] -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 true ; CHECK: F: ; CHECK-NEXT: ret i1 [[OTHER:%.*]] ; @@ -49,9 +48,7 @@ define i1 @or_distjoint_implies_ule(i8 %x, i8 %y, i1 %other) { ; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X2]], [[Y:%.*]] ; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] ; CHECK: T: -; CHECK-NEXT: [[X1:%.*]] = or disjoint i8 [[X]], 23 -; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X1]], [[Y]] -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 true ; CHECK: F: ; CHECK-NEXT: ret i1 [[OTHER:%.*]] ; @@ -121,9 +118,7 @@ define i1 @src_or_distjoint_implies_sle(i8 %x, i8 %y, i1 %other) { ; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]] ; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] ; CHECK: T: -; CHECK-NEXT: [[X1:%.*]] = or disjoint i8 [[X]], 23 -; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X1]], [[Y]] -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 true ; CHECK: F: ; CHECK-NEXT: ret i1 [[OTHER:%.*]] ; @@ -169,9 +164,7 @@ define i1 @src_addnsw_implies_sle(i8 %x, i8 %y, i1 %other) { ; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]] ; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] ; CHECK: T: -; CHECK-NEXT: [[X1:%.*]] = add nsw i8 [[X]], 23 -; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X1]], [[Y]] -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 true ; CHECK: F: ; CHECK-NEXT: ret i1 [[OTHER:%.*]] ; @@ -216,9 +209,7 @@ define i1 @src_and_implies_ult(i8 %x, i8 %y, i8 %z, i1 %other) { ; 
CHECK-NEXT: [[COND:%.*]] = icmp ult i8 [[X:%.*]], [[Z:%.*]] ; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]] ; CHECK: T: -; CHECK-NEXT: [[AND:%.*]] = and i8 [[Z]], [[X]] -; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[AND]], [[Z]] -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 true ; CHECK: F: ; CHECK-NEXT: ret i1 [[OTHER:%.*]] ; @@ -280,8 +271,7 @@ define i1 @src_or_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) { ; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[OR]], [[Z:%.*]] ; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] ; CHECK: T: -; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[Z]] -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 true ; CHECK: F: ; CHECK-NEXT: ret i1 [[OTHER:%.*]] ; @@ -322,9 +312,7 @@ define i1 @src_udiv_implies_ult(i8 %x, i8 %z, i1 %other) { ; CHECK-NEXT: [[COND:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]] ; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]] ; CHECK: T: -; CHECK-NEXT: [[AND:%.*]] = udiv i8 [[X]], 3 -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[AND]], [[Z]] -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 true ; CHECK: F: ; CHECK-NEXT: ret i1 [[OTHER:%.*]] ; @@ -345,9 +333,7 @@ define i1 @src_udiv_implies_ult2(i8 %x, i8 %z, i1 %other) { ; CHECK: T: ; CHECK-NEXT: ret i1 [[OTHER:%.*]] ; CHECK: F: -; CHECK-NEXT: [[AND:%.*]] = udiv i8 [[X]], 3 -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[AND]], [[Z]] -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 true ; %cond = icmp ule i8 %z, %x br i1 %cond, label %T, label %F @@ -403,8 +389,7 @@ define i1 @src_umax_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) { ; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[UM]], [[Z:%.*]] ; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] ; CHECK: T: -; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[Z]] -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 true ; CHECK: F: ; CHECK-NEXT: ret i1 [[OTHER:%.*]] ; @@ -424,8 +409,7 @@ define i1 @src_smax_implies_sle(i8 %x, i8 %y, i8 %z, i1 %other) { ; CHECK-NEXT: [[COND_NOT:%.*]] 
= icmp sgt i8 [[UM]], [[Z:%.*]] ; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]] ; CHECK: T: -; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X]], [[Z]] -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 true ; CHECK: F: ; CHECK-NEXT: ret i1 [[OTHER:%.*]] ; diff --git a/llvm/test/Transforms/InstSimplify/implies.ll b/llvm/test/Transforms/InstSimplify/implies.ll index 8a011908dd38..7e3cb656bce1 100644 --- a/llvm/test/Transforms/InstSimplify/implies.ll +++ b/llvm/test/Transforms/InstSimplify/implies.ll @@ -155,7 +155,13 @@ define i1 @test9(i32 %length.i, i32 %i) { define i1 @test10(i32 %length.i, i32 %x.full) { ; CHECK-LABEL: @test10( -; CHECK-NEXT: ret i1 true +; CHECK-NEXT: [[X:%.*]] = and i32 [[X_FULL:%.*]], -65536 +; CHECK-NEXT: [[LARGE:%.*]] = or i32 [[X]], 100 +; CHECK-NEXT: [[SMALL:%.*]] = or i32 [[X]], 90 +; CHECK-NEXT: [[KNOWN:%.*]] = icmp ult i32 [[LARGE]], [[LENGTH_I:%.*]] +; CHECK-NEXT: [[TO_PROVE:%.*]] = icmp ult i32 [[SMALL]], [[LENGTH_I]] +; CHECK-NEXT: [[RES:%.*]] = icmp ule i1 [[KNOWN]], [[TO_PROVE]] +; CHECK-NEXT: ret i1 [[RES]] ; %x = and i32 %x.full, 4294901760 ;; 4294901760 == 0xffff0000 %large = or i32 %x, 100 @@ -229,7 +235,13 @@ define i1 @test13(i32 %length.i, i32 %x) { define i1 @test14(i32 %length.i, i32 %x.full) { ; CHECK-LABEL: @test14( -; CHECK-NEXT: ret i1 true +; CHECK-NEXT: [[X:%.*]] = and i32 [[X_FULL:%.*]], -61681 +; CHECK-NEXT: [[LARGE:%.*]] = or i32 [[X]], 8224 +; CHECK-NEXT: [[SMALL:%.*]] = or i32 [[X]], 4112 +; CHECK-NEXT: [[KNOWN:%.*]] = icmp ult i32 [[LARGE]], [[LENGTH_I:%.*]] +; CHECK-NEXT: [[TO_PROVE:%.*]] = icmp ult i32 [[SMALL]], [[LENGTH_I]] +; CHECK-NEXT: [[RES:%.*]] = icmp ule i1 [[KNOWN]], [[TO_PROVE]] +; CHECK-NEXT: ret i1 [[RES]] ; %x = and i32 %x.full, 4294905615 ;; 4294905615 == 0xffff0f0f %large = or i32 %x, 8224 ;; == 0x2020 -- cgit v1.2.3 From 515d3f7d62679cba178fb3603db963baa6ec8c93 Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Thu, 4 Apr 2024 10:45:55 -0700 Subject: [TextAPI] Reorder addRPath 
parameters (#87601) It matches up with other _attribute_ adding member functions and helps simplify InterfaceFile assignment for InstallAPI. --- llvm/include/llvm/TextAPI/InterfaceFile.h | 4 ++-- llvm/lib/TextAPI/InterfaceFile.cpp | 8 ++++---- llvm/lib/TextAPI/TextStubV5.cpp | 2 +- llvm/unittests/TextAPI/TextStubV5Tests.cpp | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/TextAPI/InterfaceFile.h b/llvm/include/llvm/TextAPI/InterfaceFile.h index 10a37e3a0c2f..23c27cb0f474 100644 --- a/llvm/include/llvm/TextAPI/InterfaceFile.h +++ b/llvm/include/llvm/TextAPI/InterfaceFile.h @@ -299,9 +299,9 @@ public: } /// Set the runpath search paths. - /// \param InputTarget The target applicable to runpath search path. /// \param RPath The name of runpath. - void addRPath(const Target &InputTarget, StringRef RPath); + /// \param InputTarget The target applicable to runpath search path. + void addRPath(StringRef RPath, const Target &InputTarget); /// Get the list of runpath search paths. 
/// diff --git a/llvm/lib/TextAPI/InterfaceFile.cpp b/llvm/lib/TextAPI/InterfaceFile.cpp index 9979df92674c..79694c90370f 100644 --- a/llvm/lib/TextAPI/InterfaceFile.cpp +++ b/llvm/lib/TextAPI/InterfaceFile.cpp @@ -54,7 +54,7 @@ void InterfaceFile::addParentUmbrella(const Target &Target_, StringRef Parent) { ParentUmbrellas.emplace(Iter, Target_, std::string(Parent)); } -void InterfaceFile::addRPath(const Target &InputTarget, StringRef RPath) { +void InterfaceFile::addRPath(StringRef RPath, const Target &InputTarget) { if (RPath.empty()) return; using RPathEntryT = const std::pair; @@ -198,9 +198,9 @@ InterfaceFile::merge(const InterfaceFile *O) const { IF->addReexportedLibrary(Lib.getInstallName(), Target); for (const auto &[Target, Path] : rpaths()) - IF->addRPath(Target, Path); + IF->addRPath(Path, Target); for (const auto &[Target, Path] : O->rpaths()) - IF->addRPath(Target, Path); + IF->addRPath(Path, Target); for (const auto *Sym : symbols()) { IF->addSymbol(Sym->getKind(), Sym->getName(), Sym->targets(), @@ -319,7 +319,7 @@ InterfaceFile::extract(Architecture Arch) const { for (const auto &It : rpaths()) if (It.first.Arch == Arch) - IF->addRPath(It.first, It.second); + IF->addRPath(It.second, It.first); for (const auto &Lib : allowableClients()) for (const auto &Target : Lib.targets()) diff --git a/llvm/lib/TextAPI/TextStubV5.cpp b/llvm/lib/TextAPI/TextStubV5.cpp index d96981035ddd..b072c0b5d69d 100644 --- a/llvm/lib/TextAPI/TextStubV5.cpp +++ b/llvm/lib/TextAPI/TextStubV5.cpp @@ -672,7 +672,7 @@ Expected parseToInterfaceFile(const Object *File) { F->addParentUmbrella(Target, Lib); for (auto &[Path, Targets] : RPaths) for (auto Target : Targets) - F->addRPath(Target, Path); + F->addRPath(Path, Target); for (auto &[Targets, Symbols] : Exports) for (auto &Sym : Symbols) F->addSymbol(Sym.Kind, Sym.Name, Targets, Sym.Flags); diff --git a/llvm/unittests/TextAPI/TextStubV5Tests.cpp b/llvm/unittests/TextAPI/TextStubV5Tests.cpp index c77d13ef8f23..62fdd79ae497 
100644 --- a/llvm/unittests/TextAPI/TextStubV5Tests.cpp +++ b/llvm/unittests/TextAPI/TextStubV5Tests.cpp @@ -722,7 +722,7 @@ TEST(TBDv5, WriteFile) { File.setInstallName("@rpath/S/L/F/Foo.framework/Foo"); File.setCurrentVersion(PackedVersion(1, 2, 0)); File.setCompatibilityVersion(PackedVersion(1, 1, 0)); - File.addRPath(AllTargets[0], "@executable_path/.../Frameworks"); + File.addRPath("@executable_path/.../Frameworks", AllTargets[0]); for (const auto &Targ : AllTargets) { File.addParentUmbrella(Targ, "System"); @@ -897,7 +897,7 @@ TEST(TBDv5, WriteMultipleDocuments) { NestedFile.setTwoLevelNamespace(); NestedFile.setApplicationExtensionSafe(false); NestedFile.setCurrentVersion(PackedVersion(2, 1, 1)); - NestedFile.addRPath(AllTargets[0], "@executable_path/.../Frameworks"); + NestedFile.addRPath("@executable_path/.../Frameworks", AllTargets[0]); for (const auto &Targ : AllTargets) NestedFile.addReexportedLibrary("@rpath/libfoo.dylib", Targ); NestedFile.addSymbol(EncodeKind::GlobalSymbol, "_funcFoo", AllTargets, -- cgit v1.2.3 From d97d560fbf6ed26a198b3afe1594d7d63b88ab3a Mon Sep 17 00:00:00 2001 From: Daniil Kovalev Date: Thu, 4 Apr 2024 21:05:03 +0300 Subject: [AArch64][PAC][MC][ELF] Support PAuth ABI compatibility tag (#85236) Depends on #87545 Emit `GNU_PROPERTY_AARCH64_FEATURE_PAUTH` property in `.note.gnu.property` section depending on `aarch64-elf-pauthabi-platform` and `aarch64-elf-pauthabi-version` llvm module flags. 
--- llvm/lib/IR/Verifier.cpp | 22 +++++++++- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 12 ++++-- .../AArch64/MCTargetDesc/AArch64TargetStreamer.cpp | 33 +++++++++++--- .../AArch64/MCTargetDesc/AArch64TargetStreamer.h | 3 +- .../AArch64/note-gnu-property-elf-pauthabi.ll | 50 ++++++++++++++++++++++ .../module-flags-note-gnu-property-elf-pauthabi.ll | 19 ++++++++ 6 files changed, 127 insertions(+), 12 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll create mode 100644 llvm/test/Verifier/module-flags-note-gnu-property-elf-pauthabi.ll diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index ba0b72312702..64c59914cf2f 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -1734,8 +1734,28 @@ void Verifier::visitModuleFlags() { // Scan each flag, and track the flags and requirements. DenseMap SeenIDs; SmallVector Requirements; - for (const MDNode *MDN : Flags->operands()) + uint64_t PAuthABIPlatform = -1; + uint64_t PAuthABIVersion = -1; + for (const MDNode *MDN : Flags->operands()) { visitModuleFlag(MDN, SeenIDs, Requirements); + if (MDN->getNumOperands() != 3) + continue; + if (const auto *FlagName = dyn_cast_or_null(MDN->getOperand(1))) { + if (FlagName->getString() == "aarch64-elf-pauthabi-platform") { + if (const auto *PAP = + mdconst::dyn_extract_or_null(MDN->getOperand(2))) + PAuthABIPlatform = PAP->getZExtValue(); + } else if (FlagName->getString() == "aarch64-elf-pauthabi-version") { + if (const auto *PAV = + mdconst::dyn_extract_or_null(MDN->getOperand(2))) + PAuthABIVersion = PAV->getZExtValue(); + } + } + } + + if ((PAuthABIPlatform == uint64_t(-1)) != (PAuthABIVersion == uint64_t(-1))) + CheckFailed("either both or no 'aarch64-elf-pauthabi-platform' and " + "'aarch64-elf-pauthabi-version' module flags must be present"); // Validate that the requirements in the module are valid. 
for (const MDNode *Requirement : Requirements) { diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 4fa719ad67cf..f6ccd0ecfdc8 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -268,13 +268,19 @@ void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) { if (Sign->getZExtValue()) Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC; - if (Flags == 0) - return; + uint64_t PAuthABIPlatform = -1; + if (const auto *PAP = mdconst::extract_or_null( + M.getModuleFlag("aarch64-elf-pauthabi-platform"))) + PAuthABIPlatform = PAP->getZExtValue(); + uint64_t PAuthABIVersion = -1; + if (const auto *PAV = mdconst::extract_or_null( + M.getModuleFlag("aarch64-elf-pauthabi-version"))) + PAuthABIVersion = PAV->getZExtValue(); // Emit a .note.gnu.property section with the flags. auto *TS = static_cast(OutStreamer->getTargetStreamer()); - TS->emitNoteSection(Flags); + TS->emitNoteSection(Flags, PAuthABIPlatform, PAuthABIVersion); } void AArch64AsmPrinter::emitFunctionHeaderComment() { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index e1d6dd7a056b..dc5383ce941e 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -58,8 +58,17 @@ void AArch64TargetStreamer::finish() { emitNoteSection(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI); } -void AArch64TargetStreamer::emitNoteSection(unsigned Flags) { - if (Flags == 0) +void AArch64TargetStreamer::emitNoteSection(unsigned Flags, + uint64_t PAuthABIPlatform, + uint64_t PAuthABIVersion) { + assert((PAuthABIPlatform == uint64_t(-1)) == + (PAuthABIVersion == uint64_t(-1))); + uint64_t DescSz = 0; + if (Flags != 0) + DescSz += 4 * 4; + if (PAuthABIPlatform != uint64_t(-1)) + DescSz += 4 + 4 + 8 * 2; + if (DescSz == 0) return; MCStreamer &OutStreamer 
= getStreamer(); @@ -80,15 +89,25 @@ void AArch64TargetStreamer::emitNoteSection(unsigned Flags) { // Emit the note header. OutStreamer.emitValueToAlignment(Align(8)); OutStreamer.emitIntValue(4, 4); // data size for "GNU\0" - OutStreamer.emitIntValue(4 * 4, 4); // Elf_Prop size + OutStreamer.emitIntValue(DescSz, 4); // Elf_Prop array size OutStreamer.emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4); OutStreamer.emitBytes(StringRef("GNU", 4)); // note name // Emit the PAC/BTI properties. - OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4); - OutStreamer.emitIntValue(4, 4); // data size - OutStreamer.emitIntValue(Flags, 4); // data - OutStreamer.emitIntValue(0, 4); // pad + if (Flags != 0) { + OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4); + OutStreamer.emitIntValue(4, 4); // data size + OutStreamer.emitIntValue(Flags, 4); // data + OutStreamer.emitIntValue(0, 4); // pad + } + + // Emit the PAuth ABI compatibility info + if (PAuthABIPlatform != uint64_t(-1)) { + OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_PAUTH, 4); + OutStreamer.emitIntValue(8 * 2, 4); // data size + OutStreamer.emitIntValue(PAuthABIPlatform, 8); + OutStreamer.emitIntValue(PAuthABIVersion, 8); + } OutStreamer.endSection(Nt); OutStreamer.switchSection(Cur); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h index 7676d88a82b5..e8a9dc445b96 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h @@ -35,7 +35,8 @@ public: void emitCurrentConstantPool(); /// Callback used to implement the .note.gnu.property section. - void emitNoteSection(unsigned Flags); + void emitNoteSection(unsigned Flags, uint64_t PAuthABIPlatform = -1, + uint64_t PAuthABIVersion = -1); /// Callback used to implement the .inst directive. 
virtual void emitInst(uint32_t Inst); diff --git a/llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll b/llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll new file mode 100644 index 000000000000..728cffeba02a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll @@ -0,0 +1,50 @@ +; RUN: rm -rf %t && split-file %s %t && cd %t + +;--- ok.ll + +; RUN: llc -mtriple=aarch64-linux ok.ll -o - | \ +; RUN: FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=aarch64-linux ok.ll -filetype=obj -o - | \ +; RUN: llvm-readelf --notes - | FileCheck %s --check-prefix=OBJ + +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458} +!1 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 85} + +; ASM: .section .note.gnu.property,"a",@note +; ASM-NEXT: .p2align 3, 0x0 +; ASM-NEXT: .word 4 +; ASM-NEXT: .word 24 +; ASM-NEXT: .word 5 +; ASM-NEXT: .asciz "GNU" +; 3221225473 = 0xc0000001 = GNU_PROPERTY_AARCH64_FEATURE_PAUTH +; ASM-NEXT: .word 3221225473 +; ASM-NEXT: .word 16 +; ASM-NEXT: .xword 268435458 +; ASM-NEXT: .xword 85 + +; OBJ: Displaying notes found in: .note.gnu.property +; OBJ-NEXT: Owner Data size Description +; OBJ-NEXT: GNU 0x00000018 NT_GNU_PROPERTY_TYPE_0 (property note) +; OBJ-NEXT: AArch64 PAuth ABI core info: platform 0x10000002 (llvm_linux), version 0x55 (PointerAuthIntrinsics, !PointerAuthCalls, PointerAuthReturns, !PointerAuthAuthTraps, PointerAuthVTPtrAddressDiscrimination, !PointerAuthVTPtrTypeDiscrimination, PointerAuthInitFini) + +; ERR: either both or no 'aarch64-elf-pauthabi-platform' and 'aarch64-elf-pauthabi-version' module flags must be present + +;--- err1.ll + +; RUN: not llc -mtriple=aarch64-linux err1.ll 2>&1 -o - | \ +; RUN: FileCheck %s --check-prefix=ERR + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 2} + +;--- err2.ll + +; RUN: not llc -mtriple=aarch64-linux err2.ll 2>&1 -o - | \ +; RUN: FileCheck %s --check-prefix=ERR + 
+!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 31} diff --git a/llvm/test/Verifier/module-flags-note-gnu-property-elf-pauthabi.ll b/llvm/test/Verifier/module-flags-note-gnu-property-elf-pauthabi.ll new file mode 100644 index 000000000000..435073d01c8e --- /dev/null +++ b/llvm/test/Verifier/module-flags-note-gnu-property-elf-pauthabi.ll @@ -0,0 +1,19 @@ +; RUN: rm -rf %t && split-file %s %t && cd %t + +; CHECK: either both or no 'aarch64-elf-pauthabi-platform' and 'aarch64-elf-pauthabi-version' module flags must be present + +;--- err1.ll + +; RUN: not llvm-as err1.ll -o /dev/null 2>&1 | FileCheck %s + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 2} + +;--- err2.ll + +; RUN: not llvm-as err2.ll -o /dev/null 2>&1 | FileCheck %s + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 31} -- cgit v1.2.3 From 53fe94a0ce262c6e38117429a30814f54ea55b0f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 4 Apr 2024 19:13:03 +0100 Subject: [CostModel][X86] Add costkinds test coverage for masked load/store/gather/scatter Noticed while starting triage for #87640 --- .../CostModel/X86/masked-intrinsic-codesize.ll | 2413 ++++++++++++++++++++ .../X86/masked-intrinsic-cost-inseltpoison.ll | 16 +- .../CostModel/X86/masked-intrinsic-cost.ll | 16 +- .../CostModel/X86/masked-intrinsic-latency.ll | 2413 ++++++++++++++++++++ .../CostModel/X86/masked-intrinsic-sizelatency.ll | 2413 ++++++++++++++++++++ 5 files changed, 7255 insertions(+), 16 deletions(-) create mode 100644 llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll create mode 100644 llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll create mode 100644 llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll new file mode 100644 index 000000000000..55fdaafc24ed 
--- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll @@ -0,0 +1,2413 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX + +define i32 @masked_load() { +; SSE2-LABEL: 'masked_load' +; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 
x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V10F32 
= call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> 
@llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x 
i1> undef, <12 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_load' +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 
1, <6 x i1> undef, <6 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> 
@llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> 
@llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, 
<13 x i1> undef, <13 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; 
SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_load' +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> 
@llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> 
@llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, 
i32 1, <1 x i1> undef, <1 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; AVX-NEXT: Cost Model: Found an 
estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> 
@llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_load' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> 
@llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) 
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_load' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKX-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = 
call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, 
<14 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) + %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) + %V6F64 = 
call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) + %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) + %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) + %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) + %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) + %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) + %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) + %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) + %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) + %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) + %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) + %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) + %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) + %V4F32 = call 
<4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) + %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) + %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) + %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) + %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) + %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) + %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) + %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) + %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) + %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) + %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) + %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) + %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) + %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, 
<10 x i32> undef) + %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) + %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) + %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) + %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) + %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) + %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_store() { +; SSE2-LABEL: 'masked_store' +; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x 
double> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> 
undef, ptr undef, i32 1, <12 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 
x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE2-NEXT: Cost Model: 
Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: 
call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_store' +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, 
<4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 
1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; 
SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: 
Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for 
instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_store' +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, 
<16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: 
Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call 
void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr 
undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_store' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call 
void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void 
@llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr 
undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; KNL-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void 
@llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_store' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost 
of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x 
i64> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void 
@llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) + call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) + call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) + call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) + call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) + call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) + call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) + call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) + call void 
@llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) + call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) + call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) + call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) + call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) + call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) + call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) + call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr 
undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) + call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) + + call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) + call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) + call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) + + ret i32 0 +} + +define i32 @masked_gather() { +; SSE2-LABEL: 'masked_gather' +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> 
@llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: 
Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 
x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_gather' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x 
ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for 
instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX1-LABEL: 'masked_gather' +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated 
cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x 
i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX2-LABEL: 'masked_gather' +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x 
double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> 
@llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: ret i32 0 +; +; SKL-LABEL: 'masked_gather' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SKL-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x 
i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_gather' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> 
undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> 
undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_gather' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x 
i1> undef, <1 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> 
@llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) + %V4F64 = call <4 x double> 
@llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) + %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x 
i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_scatter() { +; SSE2-LABEL: 'masked_scatter' +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost 
Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void 
@llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_scatter' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an 
estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x 
i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_scatter' +; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void 
@llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; 
AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 194 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_scatter' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, 
<4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void 
@llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_scatter' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> 
undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void 
@llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + + call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + + call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) + call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 
x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + + call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) + call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) + call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + + ret i32 0 +} + +define i32 @masked_expandload() { +; SSE2-LABEL: 'masked_expandload' +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost 
Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> 
@llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_expandload' +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for 
instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, 
<32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_expandload' +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> 
@llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found 
an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX512-LABEL: 'masked_expandload' +; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; 
AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for 
instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr 
undef, <8 x i1> undef, <8 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) + %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> 
undef) + + %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_compressstore() { +; SSE2-LABEL: 'masked_compressstore' +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void 
@llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_compressstore' +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; 
SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for 
instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX1-LABEL: 'masked_compressstore' +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> 
undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for 
instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX2-LABEL: 'masked_compressstore' +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> 
undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for 
instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKL-LABEL: 'masked_compressstore' +; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; 
SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void 
@llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX512-LABEL: 'masked_compressstore' +; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; 
AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated 
cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) + call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) + + call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) + + call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) + call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) + + call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) + + call void 
@llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) + call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) + + call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) + call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) + call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) + + ret i32 0 +} + +define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) { +; SSE2-LABEL: 'test1' +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SSE42-LABEL: 'test1' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX-LABEL: 'test1' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX512-LABEL: 'test1' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; + %mask = icmp eq <2 x i64> %trigger, zeroinitializer + %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x double>%dst) + ret <2 x double> %res +} + +define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) { +; SSE2-LABEL: 'test2' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test2' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX-LABEL: 'test2' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> 
%res +; +; AVX512-LABEL: 'test2' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst) + ret <4 x i32> %res +} + +define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) { +; SSE2-LABEL: 'test3' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test3' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test3' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test3' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, 
zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v4i32.p0(<4 x i32>%val, ptr %addr, i32 4, <4 x i1>%mask) + ret void +} + +define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) { +; SSE2-LABEL: 'test4' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; SSE42-LABEL: 'test4' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX1-LABEL: 'test4' +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX2-LABEL: 'test4' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x 
float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; SKL-LABEL: 'test4' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX512-LABEL: 'test4' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; + %mask = icmp eq <8 x i32> %trigger, zeroinitializer + %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1>%mask, <8 x float>%dst) + ret <8 x float> %res +} + +define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) { +; SSE2-LABEL: 'test5' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test5' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test5' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test5' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2f32.p0(<2 x float>%val, ptr %addr, i32 4, <2 x i1>%mask) + ret void +} + +define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) { +; SSE2-LABEL: 'test6' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test6' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test6' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> 
%trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test6' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2i32.p0(<2 x i32>%val, ptr %addr, i32 4, <2 x i1>%mask) + ret void +} + +define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) { +; SSE2-LABEL: 'test7' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; SSE42-LABEL: 'test7' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; AVX-LABEL: 'test7' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x float> 
@llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; AVX512-LABEL: 'test7' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) + ret <2 x float> %res +} + +define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) { +; SSE2-LABEL: 'test8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; SSE42-LABEL: 'test8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; AVX-LABEL: 'test8' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> 
%dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; AVX512-LABEL: 'test8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) + ret <2 x i32> %res +} + +define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) { +; SSE2-LABEL: 'test_gather_2f64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SSE42-LABEL: 'test_gather_2f64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX1-LABEL: 'test_gather_2f64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX2-LABEL: 'test_gather_2f64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX2-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SKL-LABEL: 'test_gather_2f64' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX512-LABEL: 'test_gather_2f64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; + %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) + ret <2 x double> %res +} + +define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) { +; SSE2-LABEL: 'test_gather_4i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test_gather_4i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX1-LABEL: 'test_gather_4i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX2-LABEL: 'test_gather_4i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = 
call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKL-LABEL: 'test_gather_4i32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; KNL-LABEL: 'test_gather_4i32' +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKX-LABEL: 'test_gather_4i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) + ret <4 x i32> %res +} + +define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) { +; SSE2-LABEL: 'test_gather_4i32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test_gather_4i32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX1-LABEL: 
'test_gather_4i32_const_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX2-LABEL: 'test_gather_4i32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKL-LABEL: 'test_gather_4i32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; KNL-LABEL: 'test_gather_4i32_const_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKX-LABEL: 'test_gather_4i32_const_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) + ret <4 x i32> %res +} + +define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) { +; SSE2-LABEL: 'test_gather_16f32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_const_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x 
i1> , <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_const_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1>%mask) { +; SSE2-LABEL: 'test_gather_16f32_var_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost 
of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_var_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_var_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_var_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x 
float> %res +; +; SKL-LABEL: 'test_gather_16f32_var_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_var_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1>%mask) { +; SSE2-LABEL: 'test_gather_16f32_ra_var_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> 
@llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_ra_var_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; 
SKL-LABEL: 'test_gather_16f32_ra_var_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { +; SSE2-LABEL: 'test_gather_16f32_const_mask2' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_const_mask2' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_const_mask2' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_const_mask2' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_const_mask2' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> 
%sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_const_mask2' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { +; SSE2-LABEL: 'test_scatter_16i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr 
%base, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_16i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX1-LABEL: 'test_scatter_16i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; 
AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX2-LABEL: 'test_scatter_16i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SKL-LABEL: 'test_scatter_16i32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %imask = bitcast i16 %mask to <16 x i1> +; SKL-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test_scatter_16i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + + %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind + %imask = bitcast i16 %mask to <16 x i1> + call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>%val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) + ret void +} + +define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) { +; SSE2-LABEL: 'test_scatter_8i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_8i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test_scatter_8i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test_scatter_8i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) + ret void +} + +define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) { +; SSE2-LABEL: 'test_scatter_4i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_4i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test_scatter_4i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; KNL-LABEL: 'test_scatter_4i32' 
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SKX-LABEL: 'test_scatter_4i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) + ret void +} + +define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) { +; SSE2-LABEL: 'test_gather_4f32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SSE42-LABEL: 'test_gather_4f32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX1-LABEL: 'test_gather_4f32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x 
i32> %ind to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX2-LABEL: 'test_gather_4f32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKL-LABEL: 'test_gather_4f32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; KNL-LABEL: 'test_gather_4f32' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, 
<4 x i1> %mask, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKX-LABEL: 'test_gather_4f32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) + ret <4 x float>%res +} + +define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) { +; SSE2-LABEL: 'test_gather_4f32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SSE42-LABEL: 'test_gather_4f32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> 
@llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX1-LABEL: 'test_gather_4f32_const_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX2-LABEL: 'test_gather_4f32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKL-LABEL: 'test_gather_4f32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; KNL-LABEL: 'test_gather_4f32_const_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKX-LABEL: 'test_gather_4f32_const_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) + ret <4 x float>%res +} + +declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>) +declare <7 x double> @llvm.masked.load.v7f64.p0(ptr, i32, <7 x i1>, <7 x double>) +declare <6 x double> @llvm.masked.load.v6f64.p0(ptr, i32, <6 x i1>, <6 x double>) +declare <5 x double> @llvm.masked.load.v5f64.p0(ptr, i32, <5 x i1>, <5 x double>) +declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>) +declare <3 x double> @llvm.masked.load.v3f64.p0(ptr, i32, <3 x i1>, <3 x double>) +declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>) + +declare <16 x float> 
@llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>) +declare <15 x float> @llvm.masked.load.v15f32.p0(ptr, i32, <15 x i1>, <15 x float>) +declare <14 x float> @llvm.masked.load.v14f32.p0(ptr, i32, <14 x i1>, <14 x float>) +declare <13 x float> @llvm.masked.load.v13f32.p0(ptr, i32, <13 x i1>, <13 x float>) +declare <12 x float> @llvm.masked.load.v12f32.p0(ptr, i32, <12 x i1>, <12 x float>) +declare <11 x float> @llvm.masked.load.v11f32.p0(ptr, i32, <11 x i1>, <11 x float>) +declare <10 x float> @llvm.masked.load.v10f32.p0(ptr, i32, <10 x i1>, <10 x float>) +declare <9 x float> @llvm.masked.load.v9f32.p0(ptr, i32, <9 x i1>, <9 x float>) +declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>) +declare <7 x float> @llvm.masked.load.v7f32.p0(ptr, i32, <7 x i1>, <7 x float>) +declare <6 x float> @llvm.masked.load.v6f32.p0(ptr, i32, <6 x i1>, <6 x float>) +declare <5 x float> @llvm.masked.load.v5f32.p0(ptr, i32, <5 x i1>, <5 x float>) +declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>) +declare <3 x float> @llvm.masked.load.v3f32.p0(ptr, i32, <3 x i1>, <3 x float>) +declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>) +declare <1 x float> @llvm.masked.load.v1f32.p0(ptr, i32, <1 x i1>, <1 x float>) + +declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>) +declare <7 x i64> @llvm.masked.load.v7i64.p0(ptr, i32, <7 x i1>, <7 x i64>) +declare <6 x i64> @llvm.masked.load.v6i64.p0(ptr, i32, <6 x i1>, <6 x i64>) +declare <5 x i64> @llvm.masked.load.v5i64.p0(ptr, i32, <5 x i1>, <5 x i64>) +declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>) +declare <3 x i64> @llvm.masked.load.v3i64.p0(ptr, i32, <3 x i1>, <3 x i64>) +declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x 
i32>) +declare <15 x i32> @llvm.masked.load.v15i32.p0(ptr, i32, <15 x i1>, <15 x i32>) +declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>) +declare <13 x i32> @llvm.masked.load.v13i32.p0(ptr, i32, <13 x i1>, <13 x i32>) +declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>) +declare <11 x i32> @llvm.masked.load.v11i32.p0(ptr, i32, <11 x i1>, <11 x i32>) +declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>) +declare <9 x i32> @llvm.masked.load.v9i32.p0(ptr, i32, <9 x i1>, <9 x i32>) +declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>) +declare <7 x i32> @llvm.masked.load.v7i32.p0(ptr, i32, <7 x i1>, <7 x i32>) +declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>) +declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr, i32, <5 x i1>, <5 x i32>) +declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>) +declare <3 x i32> @llvm.masked.load.v3i32.p0(ptr, i32, <3 x i1>, <3 x i32>) +declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>) +declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>) + +declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>) +declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7f64.p0(<7 x double>, ptr, i32, <7 x i1>) +declare void 
@llvm.masked.store.v6f64.p0(<6 x double>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5f64.p0(<5 x double>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3f64.p0(<3 x double>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1f64.p0(<1 x double>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v15f32.p0(<15 x float>, ptr, i32, <15 x i1>) +declare void @llvm.masked.store.v14f32.p0(<14 x float>, ptr, i32, <14 x i1>) +declare void @llvm.masked.store.v13f32.p0(<13 x float>, ptr, i32, <13 x i1>) +declare void @llvm.masked.store.v12f32.p0(<12 x float>, ptr, i32, <12 x i1>) +declare void @llvm.masked.store.v11f32.p0(<11 x float>, ptr, i32, <11 x i1>) +declare void @llvm.masked.store.v10f32.p0(<10 x float>, ptr, i32, <10 x i1>) +declare void @llvm.masked.store.v9f32.p0(<9 x float>, ptr, i32, <9 x i1>) +declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7f32.p0(<7 x float>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6f32.p0(<6 x float>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5f32.p0(<5 x float>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3f32.p0(<3 x float>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2f32.p0(<2 x float>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1f32.p0(<1 x float>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v8i64.p0(<8 x i64>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7i64.p0(<7 x i64>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6i64.p0(<6 x i64>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5i64.p0(<5 x i64>, ptr, i32, <5 x i1>) +declare void 
@llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3i64.p0(<3 x i64>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1i64.p0(<1 x i64>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v16i32.p0(<16 x i32>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v15i32.p0(<15 x i32>, ptr, i32, <15 x i1>) +declare void @llvm.masked.store.v14i32.p0(<14 x i32>, ptr, i32, <14 x i1>) +declare void @llvm.masked.store.v13i32.p0(<13 x i32>, ptr, i32, <13 x i1>) +declare void @llvm.masked.store.v12i32.p0(<12 x i32>, ptr, i32, <12 x i1>) +declare void @llvm.masked.store.v11i32.p0(<11 x i32>, ptr, i32, <11 x i1>) +declare void @llvm.masked.store.v10i32.p0(<10 x i32>, ptr, i32, <10 x i1>) +declare void @llvm.masked.store.v9i32.p0(<9 x i32>, ptr, i32, <9 x i1>) +declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7i32.p0(<7 x i32>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6i32.p0(<6 x i32>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5i32.p0(<5 x i32>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3i32.p0(<3 x i32>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32, <32 x i1>) +declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>) + +declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>) +declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>) +declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>) 
+declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>) + +declare <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>) + +declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>) +declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>) +declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>) +declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>) + +declare <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr>, i32, <64 x i1>, <64 x i8>) +declare <32 x i8> 
@llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.scatter.v8f64.v8p0(<8 x double>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1f64.v1p0(<1 x double>, <1 x ptr>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v8i64.v8p0(<8 x i64>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1i64.v1p0(<1 x i64>, <1 x ptr>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v32i16.v32p0(<32 x i16>, <32 x ptr>, i32, <32 x i1>) +declare void @llvm.masked.scatter.v16i16.v16p0(<16 x i16>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>) + +declare void 
@llvm.masked.scatter.v64i8.v64p0(<64 x i8>, <64 x ptr>, i32, <64 x i1>) +declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32, <32 x i1>) +declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>) + +declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>) + +declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>) +declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>) +declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>) +declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>) +declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>) + +declare <32 x i16> @llvm.masked.expandload.v32i16(ptr, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.expandload.v64i8(ptr, <64 x i1>, <64 x i8>) +declare <32 x i8> 
@llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>) +declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>) + +declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>) + +declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>) +declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>) + +declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>) + +declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>) +declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>) + +declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>) +declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>) +declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x 
i1>) diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll index 897344d622d0..ad56c28510f5 100644 --- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse2 -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse4.2 -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE42 -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx2 -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX2 -; -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skylake -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,SKL -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=knl -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,KNL -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skx -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,SKX +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; +; RUN: opt 
< %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX define i32 @masked_load() { ; SSE2-LABEL: 'masked_load' diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll index 5f22b2e39f94..c7e7c46482c4 100644 --- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse2 -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse4.2 -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE42 -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx2 -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX2 -; -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skylake -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,SKL -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=knl -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,KNL -; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skx -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,SKX +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -S 
-mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX define i32 @masked_load() { ; SSE2-LABEL: 'masked_load' diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll new file mode 100644 index 000000000000..edb05ad71a40 --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll @@ -0,0 +1,2413 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=latency -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=latency 
-mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX + +define i32 @masked_load() { +; SSE2-LABEL: 'masked_load' +; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> 
@llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x 
float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x 
i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for 
instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_load' +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 
x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr 
undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> 
undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost 
Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for 
instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_load' +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x 
double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) 
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, 
i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX-NEXT: Cost Model: Found an 
estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_load' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x 
i1> undef, <2 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 
x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> 
@llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; 
KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_load' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> 
@llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> 
@llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> 
undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, 
<4 x i1> undef, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) + %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) + %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) + %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) + %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) + %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) + %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x 
i1> undef, <14 x float> undef) + %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) + %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) + %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) + %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) + %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) + %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) + %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) + %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) + %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) + %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) + %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) + %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) + %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) + %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) + %V3I64 = call <3 x 
i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) + %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) + %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) + %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) + %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) + %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) + %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) + %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) + %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) + %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) + %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) + %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) + %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) + + 
%V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_store() { +; SSE2-LABEL: 'masked_store' +; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void 
@llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void 
@llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, 
ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) 
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_store' +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void 
@llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void 
@llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x 
i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, 
<2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_store' +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for 
instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void 
@llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr 
undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; AVX-NEXT: Cost 
Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void 
@llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_store' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost 
Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 
2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> 
undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 
0 +; +; SKX-LABEL: 'masked_store' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) 
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void 
@llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x 
i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) + + call void 
@llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) + call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) + call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) + call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) + call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) + call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) + call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) + call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3i64.p0(<3 x 
i64> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) + call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) + call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) + call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) + call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) + call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) + call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) + call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) + call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) + call void 
@llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) + + call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) + call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) + call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) + + ret i32 0 +} + +define i32 @masked_gather() { +; SSE2-LABEL: 'masked_gather' +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x 
ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: 
%V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_gather' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for 
instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x 
i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX1-LABEL: 'masked_gather' +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x 
double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> 
@llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an 
estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX2-LABEL: 'masked_gather' +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX2-NEXT: Cost Model: 
Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x 
i1> undef, <8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKL-LABEL: 'masked_gather' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> 
undef, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> 
@llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_gather' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> 
@llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> 
undef, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_gather' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x 
i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x 
ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) + %V4I64 = call <4 x i64> 
@llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_scatter() { +; SSE2-LABEL: 'masked_scatter' +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: 
Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void 
@llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> 
undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_scatter' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x 
i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost 
Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_scatter' +; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> 
undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 194 for instruction: call void 
@llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_scatter' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for 
instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> 
undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_scatter' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> 
undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void 
@llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> 
undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + + call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + + call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) + call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + + call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) + call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) + call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + + ret i32 0 +} + +define i32 @masked_expandload() { +; SSE2-LABEL: 'masked_expandload' +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> 
@llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_expandload' +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> 
undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for 
instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_expandload' +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; AVX-NEXT: Cost Model: 
Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x 
i1> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX512-LABEL: 'masked_expandload' +; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x 
float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; 
AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) + %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) + 
%V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_compressstore() { +; SSE2-LABEL: 'masked_compressstore' +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) 
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_compressstore' +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> 
undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost 
of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX1-LABEL: 'masked_compressstore' +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, 
<2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 
5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX2-LABEL: 'masked_compressstore' +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> 
undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for 
instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKL-LABEL: 'masked_compressstore' +; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; 
SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void 
@llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX512-LABEL: 'masked_compressstore' +; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) + call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) + + call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) + call void 
@llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) + + call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) + call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) + + call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) + + call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) + call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) + + call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) + call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) + call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) + + ret i32 0 +} + +define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) { +; SSE2-LABEL: 'test1' +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost 
of 10 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SSE42-LABEL: 'test1' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX-LABEL: 'test1' +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX512-LABEL: 'test1' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; + %mask = icmp eq <2 x i64> %trigger, zeroinitializer + %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x double>%dst) + ret <2 x double> %res +} + +define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) { +; SSE2-LABEL: 'test2' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %res = call <4 x i32> 
@llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test2' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX-LABEL: 'test2' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX512-LABEL: 'test2' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst) + ret <4 x i32> %res +} + +define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) { +; SSE2-LABEL: 'test3' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test3' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test3' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test3' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v4i32.p0(<4 x i32>%val, ptr %addr, i32 4, <4 x i1>%mask) + ret void +} + +define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) { +; SSE2-LABEL: 'test4' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; SSE42-LABEL: 'test4' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = 
icmp eq <8 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX1-LABEL: 'test4' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX2-LABEL: 'test4' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; SKL-LABEL: 'test4' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX512-LABEL: 'test4' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; + 
%mask = icmp eq <8 x i32> %trigger, zeroinitializer + %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1>%mask, <8 x float>%dst) + ret <8 x float> %res +} + +define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) { +; SSE2-LABEL: 'test5' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test5' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test5' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test5' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2f32.p0(<2 x float>%val, ptr %addr, i32 4, <2 x i1>%mask) + ret void +} + 
+define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) { +; SSE2-LABEL: 'test6' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test6' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test6' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test6' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2i32.p0(<2 x i32>%val, ptr %addr, i32 4, <2 x i1>%mask) + ret void +} + +define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) { +; SSE2-LABEL: 'test7' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> 
%trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; SSE42-LABEL: 'test7' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; AVX-LABEL: 'test7' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; AVX512-LABEL: 'test7' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) + ret <2 x float> %res +} + +define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) { +; SSE2-LABEL: 'test8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found 
an estimated cost of 12 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; SSE42-LABEL: 'test8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; AVX-LABEL: 'test8' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; AVX512-LABEL: 'test8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) + ret <2 x i32> %res +} + +define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) { +; SSE2-LABEL: 'test_gather_2f64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE2-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SSE42-LABEL: 'test_gather_2f64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX1-LABEL: 'test_gather_2f64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX2-LABEL: 'test_gather_2f64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SKL-LABEL: 'test_gather_2f64' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX512-LABEL: 'test_gather_2f64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; + %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) + ret <2 x double> %res +} + +define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) { +; SSE2-LABEL: 'test_gather_4i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 
for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test_gather_4i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX1-LABEL: 'test_gather_4i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX2-LABEL: 'test_gather_4i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKL-LABEL: 'test_gather_4i32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; KNL-LABEL: 'test_gather_4i32' +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKX-LABEL: 'test_gather_4i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKX-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) + ret <4 x i32> %res +} + +define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) { +; SSE2-LABEL: 'test_gather_4i32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test_gather_4i32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX1-LABEL: 'test_gather_4i32_const_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX2-LABEL: 'test_gather_4i32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKL-LABEL: 'test_gather_4i32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; KNL-LABEL: 'test_gather_4i32_const_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for 
instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKX-LABEL: 'test_gather_4i32_const_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) + ret <4 x i32> %res +} + +define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) { +; SSE2-LABEL: 'test_gather_16f32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_const_mask' +; AVX1-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_const_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX512-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1>%mask) { +; SSE2-LABEL: 'test_gather_16f32_var_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_var_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_var_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found 
an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_var_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_var_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_var_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> 
@llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1>%mask) { +; SSE2-LABEL: 'test_gather_16f32_ra_var_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_ra_var_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_ra_var_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> 
@llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { +; SSE2-LABEL: 'test_gather_16f32_const_mask2' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_const_mask2' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; 
SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_const_mask2' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_const_mask2' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, 
<16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_const_mask2' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_const_mask2' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { +; SSE2-LABEL: 'test_scatter_16i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_16i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 
x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX1-LABEL: 'test_scatter_16i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX2-LABEL: 'test_scatter_16i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> 
%broadcast.splat, <16 x i32> %ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SKL-LABEL: 'test_scatter_16i32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SKL-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test_scatter_16i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + + %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind + %imask = bitcast i16 %mask to <16 x i1> + call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>%val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) + ret void +} + +define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) { +; SSE2-LABEL: 'test_scatter_8i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_8i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test_scatter_8i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test_scatter_8i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) + ret void 
+} + +define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) { +; SSE2-LABEL: 'test_scatter_4i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_4i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test_scatter_4i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; KNL-LABEL: 'test_scatter_4i32' +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SKX-LABEL: 'test_scatter_4i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) + ret void +} + +define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) { +; SSE2-LABEL: 'test_gather_4f32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SSE42-LABEL: 'test_gather_4f32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX1-LABEL: 'test_gather_4f32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX2-LABEL: 'test_gather_4f32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; 
SKL-LABEL: 'test_gather_4f32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; KNL-LABEL: 'test_gather_4f32' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKX-LABEL: 'test_gather_4f32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) + ret <4 x float>%res +} + +define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x 
i32> %ind) { +; SSE2-LABEL: 'test_gather_4f32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SSE42-LABEL: 'test_gather_4f32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX1-LABEL: 'test_gather_4f32_const_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX2-LABEL: 'test_gather_4f32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr 
float, ptr %ptr, <4 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKL-LABEL: 'test_gather_4f32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; KNL-LABEL: 'test_gather_4f32_const_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKX-LABEL: 'test_gather_4f32_const_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret 
<4 x float> %res +; + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) + ret <4 x float>%res +} + +declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>) +declare <7 x double> @llvm.masked.load.v7f64.p0(ptr, i32, <7 x i1>, <7 x double>) +declare <6 x double> @llvm.masked.load.v6f64.p0(ptr, i32, <6 x i1>, <6 x double>) +declare <5 x double> @llvm.masked.load.v5f64.p0(ptr, i32, <5 x i1>, <5 x double>) +declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>) +declare <3 x double> @llvm.masked.load.v3f64.p0(ptr, i32, <3 x i1>, <3 x double>) +declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>) +declare <15 x float> @llvm.masked.load.v15f32.p0(ptr, i32, <15 x i1>, <15 x float>) +declare <14 x float> @llvm.masked.load.v14f32.p0(ptr, i32, <14 x i1>, <14 x float>) +declare <13 x float> @llvm.masked.load.v13f32.p0(ptr, i32, <13 x i1>, <13 x float>) +declare <12 x float> @llvm.masked.load.v12f32.p0(ptr, i32, <12 x i1>, <12 x float>) +declare <11 x float> @llvm.masked.load.v11f32.p0(ptr, i32, <11 x i1>, <11 x float>) +declare <10 x float> @llvm.masked.load.v10f32.p0(ptr, i32, <10 x i1>, <10 x float>) +declare <9 x float> @llvm.masked.load.v9f32.p0(ptr, i32, <9 x i1>, <9 x float>) +declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>) +declare <7 x float> @llvm.masked.load.v7f32.p0(ptr, i32, <7 x i1>, <7 x float>) +declare <6 x float> @llvm.masked.load.v6f32.p0(ptr, i32, <6 x i1>, <6 x float>) +declare <5 x float> @llvm.masked.load.v5f32.p0(ptr, i32, <5 x i1>, <5 x float>) +declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x 
i1>, <4 x float>) +declare <3 x float> @llvm.masked.load.v3f32.p0(ptr, i32, <3 x i1>, <3 x float>) +declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>) +declare <1 x float> @llvm.masked.load.v1f32.p0(ptr, i32, <1 x i1>, <1 x float>) + +declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>) +declare <7 x i64> @llvm.masked.load.v7i64.p0(ptr, i32, <7 x i1>, <7 x i64>) +declare <6 x i64> @llvm.masked.load.v6i64.p0(ptr, i32, <6 x i1>, <6 x i64>) +declare <5 x i64> @llvm.masked.load.v5i64.p0(ptr, i32, <5 x i1>, <5 x i64>) +declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>) +declare <3 x i64> @llvm.masked.load.v3i64.p0(ptr, i32, <3 x i1>, <3 x i64>) +declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>) +declare <15 x i32> @llvm.masked.load.v15i32.p0(ptr, i32, <15 x i1>, <15 x i32>) +declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>) +declare <13 x i32> @llvm.masked.load.v13i32.p0(ptr, i32, <13 x i1>, <13 x i32>) +declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>) +declare <11 x i32> @llvm.masked.load.v11i32.p0(ptr, i32, <11 x i1>, <11 x i32>) +declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>) +declare <9 x i32> @llvm.masked.load.v9i32.p0(ptr, i32, <9 x i1>, <9 x i32>) +declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>) +declare <7 x i32> @llvm.masked.load.v7i32.p0(ptr, i32, <7 x i1>, <7 x i32>) +declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>) +declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr, i32, <5 x i1>, <5 x i32>) +declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>) +declare <3 x i32> @llvm.masked.load.v3i32.p0(ptr, i32, <3 x i1>, <3 x i32>) +declare <2 
x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>) +declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>) + +declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>) +declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7f64.p0(<7 x double>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6f64.p0(<6 x double>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5f64.p0(<5 x double>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3f64.p0(<3 x double>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1f64.p0(<1 x double>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v15f32.p0(<15 x float>, ptr, i32, <15 x i1>) +declare void @llvm.masked.store.v14f32.p0(<14 x float>, ptr, i32, <14 x i1>) +declare void @llvm.masked.store.v13f32.p0(<13 x float>, ptr, i32, <13 x i1>) +declare void @llvm.masked.store.v12f32.p0(<12 x float>, ptr, i32, <12 x i1>) +declare void @llvm.masked.store.v11f32.p0(<11 x float>, ptr, i32, <11 x i1>) +declare void @llvm.masked.store.v10f32.p0(<10 x float>, ptr, i32, <10 x i1>) +declare void @llvm.masked.store.v9f32.p0(<9 x float>, ptr, i32, <9 x 
i1>) +declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7f32.p0(<7 x float>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6f32.p0(<6 x float>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5f32.p0(<5 x float>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3f32.p0(<3 x float>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2f32.p0(<2 x float>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1f32.p0(<1 x float>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v8i64.p0(<8 x i64>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7i64.p0(<7 x i64>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6i64.p0(<6 x i64>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5i64.p0(<5 x i64>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3i64.p0(<3 x i64>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1i64.p0(<1 x i64>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v16i32.p0(<16 x i32>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v15i32.p0(<15 x i32>, ptr, i32, <15 x i1>) +declare void @llvm.masked.store.v14i32.p0(<14 x i32>, ptr, i32, <14 x i1>) +declare void @llvm.masked.store.v13i32.p0(<13 x i32>, ptr, i32, <13 x i1>) +declare void @llvm.masked.store.v12i32.p0(<12 x i32>, ptr, i32, <12 x i1>) +declare void @llvm.masked.store.v11i32.p0(<11 x i32>, ptr, i32, <11 x i1>) +declare void @llvm.masked.store.v10i32.p0(<10 x i32>, ptr, i32, <10 x i1>) +declare void @llvm.masked.store.v9i32.p0(<9 x i32>, ptr, i32, <9 x i1>) +declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7i32.p0(<7 x i32>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6i32.p0(<6 x i32>, 
ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5i32.p0(<5 x i32>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3i32.p0(<3 x i32>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32, <32 x i1>) +declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>) + +declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>) +declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>) +declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>) + +declare <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>) + +declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>) +declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>) +declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>) +declare 
<1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>) +declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>) + +declare <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr>, i32, <64 x i1>, <64 x i8>) +declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.scatter.v8f64.v8p0(<8 x double>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1f64.v1p0(<1 x double>, <1 x ptr>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v8i64.v8p0(<8 x i64>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, 
i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1i64.v1p0(<1 x i64>, <1 x ptr>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v32i16.v32p0(<32 x i16>, <32 x ptr>, i32, <32 x i1>) +declare void @llvm.masked.scatter.v16i16.v16p0(<16 x i16>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>) + +declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8>, <64 x ptr>, i32, <64 x i1>) +declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32, <32 x i1>) +declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>) + +declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>) + +declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>) +declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x 
i64>) +declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>) +declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>) +declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>) + +declare <32 x i16> @llvm.masked.expandload.v32i16(ptr, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.expandload.v64i8(ptr, <64 x i1>, <64 x i8>) +declare <32 x i8> @llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>) +declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>) + +declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>) + +declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>) +declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>) + +declare void 
@llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>) + +declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>) +declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>) + +declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>) +declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>) +declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>) diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll new file mode 100644 index 000000000000..3ebd9cc41c09 --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll @@ -0,0 +1,2413 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 
-disable-output -cost-kind=size-latency -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL +; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX + +define i32 @masked_load() { +; SSE2-LABEL: 'masked_load' +; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: 
%V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for 
instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> 
@llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, 
i32 1, <7 x i1> undef, <7 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost 
Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_load' +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> 
@llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = 
call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> 
@llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> 
@llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> 
undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_load' +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = 
call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x 
float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 
x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr 
undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_load' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, 
<4 x i1> undef, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) +; KNL-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> 
@llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_load' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) +; SKX-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> 
@llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, 
<9 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef) + %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef) + %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef) + %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef) + %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef) + %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef) + 
%V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef) + %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef) + %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef) + %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef) + %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef) + %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef) + %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef) + %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef) + %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef) + %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef) + %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef) + %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef) + %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef) + %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef) + %V4I64 = call <4 x i64> 
@llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef) + %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef) + %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef) + %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef) + %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef) + %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef) + %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef) + %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef) + %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef) + %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef) + %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef) + %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef) + %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef) + %V1I32 = 
call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_store() { +; SSE2-LABEL: 'masked_store' +; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> 
undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated 
cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call 
void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> 
undef, ptr undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_store' +; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE42-NEXT: Cost 
Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost 
Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 70 
for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void 
@llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_store' +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) 
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; AVX-NEXT: Cost 
Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for 
instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void 
@llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 
x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_store' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void 
@llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x 
float> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost 
Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call 
void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, 
i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_store' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void 
@llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 
x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef) + call void 
@llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef) + call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef) + call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef) + call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef) + call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef) + call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef) + call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef) + call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef) + call void 
@llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef) + call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef) + call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef) + call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef) + call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef) + call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef) + call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef) + call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef) + call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef) + call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef) + call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef) + call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef) + call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef) + call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef) + call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, 
i32 1, <16 x i1> undef) + call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef) + call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef) + + call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef) + call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef) + call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef) + call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef) + + ret i32 0 +} + +define i32 @masked_gather() { +; SSE2-LABEL: 'masked_gather' +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x 
ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_gather' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x 
ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 
for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX1-LABEL: 'masked_gather' +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for 
instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> 
undef, <2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> 
@llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX2-LABEL: 'masked_gather' +; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call 
<2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; AVX2-NEXT: Cost Model: 
Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKL-LABEL: 'masked_gather' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SKL-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x 
i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_gather' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; 
KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 
x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = 
call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_gather' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 
x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost 
of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, 
i32 1, <8 x i1> undef, <8 x i64> undef) + %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_scatter() { +; SSE2-LABEL: 'masked_scatter' +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, 
<8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost 
of 60 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 
x ptr> undef, i32 1, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_scatter' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void 
@llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 
x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_scatter' +; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, 
<8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 194 for 
instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; KNL-LABEL: 'masked_scatter' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an 
estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> 
undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKX-LABEL: 'masked_scatter' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x 
float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 49 for 
instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void 
@llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + + call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) + + call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) + + call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) + call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) + + call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) + call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) + call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) + + ret i32 0 +} + +define i32 @masked_expandload() { +; SSE2-LABEL: 'masked_expandload' +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> 
undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: 
%V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_expandload' +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> 
@llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; SSE42-NEXT: Cost 
Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX-LABEL: 'masked_expandload' +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x 
i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> 
@llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX512-LABEL: 'masked_expandload' +; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated 
cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> 
@llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef) + + %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef) + %V4I64 = call <4 x i64> 
@llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef) + + ret i32 0 +} + +define i32 @masked_compressstore() { +; SSE2-LABEL: 'masked_compressstore' +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void 
@llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 
x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SSE42-LABEL: 'masked_compressstore' +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void 
@llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr 
undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX1-LABEL: 'masked_compressstore' +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr 
undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX2-LABEL: 'masked_compressstore' +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void 
@llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x 
i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; SKL-LABEL: 'masked_compressstore' +; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void 
@llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) 
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; +; AVX512-LABEL: 'masked_compressstore' +; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void 
@llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x 
i32> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef) + call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef) + + call void 
@llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef) + + call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef) + call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef) + + call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef) + call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef) + + call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef) + call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef) + call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef) + + call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef) + call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef) + call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef) + call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef) + + ret i32 0 +} + +define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) { +; SSE2-LABEL: 'test1' +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask = 
icmp eq <2 x i64> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SSE42-LABEL: 'test1' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX-LABEL: 'test1' +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX512-LABEL: 'test1' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; + %mask = icmp eq <2 x i64> %trigger, zeroinitializer + %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x double>%dst) + ret <2 x double> %res +} + +define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) { +; SSE2-LABEL: 'test2' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer 
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test2' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX-LABEL: 'test2' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX512-LABEL: 'test2' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst) + ret <4 x i32> %res +} + +define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) { +; SSE2-LABEL: 'test3' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void 
@llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test3' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test3' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test3' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v4i32.p0(<4 x i32>%val, ptr %addr, i32 4, <4 x i1>%mask) + ret void +} + +define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) { +; SSE2-LABEL: 'test4' +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; 
+; SSE42-LABEL: 'test4' +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX1-LABEL: 'test4' +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX2-LABEL: 'test4' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; SKL-LABEL: 'test4' +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; +; AVX512-LABEL: 'test4' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> 
%dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res +; + %mask = icmp eq <8 x i32> %trigger, zeroinitializer + %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1>%mask, <8 x float>%dst) + ret <8 x float> %res +} + +define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) { +; SSE2-LABEL: 'test5' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test5' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test5' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test5' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + 
call void @llvm.masked.store.v2f32.p0(<2 x float>%val, ptr %addr, i32 4, <2 x i1>%mask) + ret void +} + +define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) { +; SSE2-LABEL: 'test6' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test6' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test6' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test6' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2i32.p0(<2 x i32>%val, ptr %addr, i32 4, <2 x i1>%mask) + ret void +} + +define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) { +; SSE2-LABEL: 
'test7' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; SSE42-LABEL: 'test7' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; AVX-LABEL: 'test7' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; +; AVX512-LABEL: 'test7' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) + ret <2 x float> %res +} + +define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) { +; SSE2-LABEL: 'test8' +; SSE2-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; SSE42-LABEL: 'test8' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; AVX-LABEL: 'test8' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; +; AVX512-LABEL: 'test8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res +; + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) + ret <2 x i32> %res +} + +define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) { +; SSE2-LABEL: 'test_gather_2f64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> 
@llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SSE42-LABEL: 'test_gather_2f64' +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX1-LABEL: 'test_gather_2f64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX2-LABEL: 'test_gather_2f64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; SKL-LABEL: 'test_gather_2f64' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; +; AVX512-LABEL: 'test_gather_2f64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res +; + %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) + ret <2 x double> %res +} + +define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> 
%mask, <4 x i32> %src0) { +; SSE2-LABEL: 'test_gather_4i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test_gather_4i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX1-LABEL: 'test_gather_4i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX2-LABEL: 'test_gather_4i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKL-LABEL: 'test_gather_4i32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; KNL-LABEL: 'test_gather_4i32' +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKX-LABEL: 'test_gather_4i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = 
call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) + ret <4 x i32> %res +} + +define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) { +; SSE2-LABEL: 'test_gather_4i32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SSE42-LABEL: 'test_gather_4i32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX1-LABEL: 'test_gather_4i32_const_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; AVX2-LABEL: 'test_gather_4i32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKL-LABEL: 'test_gather_4i32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res 
+; +; KNL-LABEL: 'test_gather_4i32_const_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; +; SKX-LABEL: 'test_gather_4i32_const_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res +; + %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> , <4 x i32> %src0) + ret <4 x i32> %res +} + +define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) { +; SSE2-LABEL: 'test_gather_16f32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret 
<16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_const_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_const_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1>%mask) { +; SSE2-LABEL: 'test_gather_16f32_var_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_var_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_var_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_var_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_var_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_var_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1>%mask) { +; SSE2-LABEL: 'test_gather_16f32_ra_var_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_ra_var_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; 
AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_ra_var_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_ra_var_mask' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) + ret <16 x float>%res +} + +define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { +; SSE2-LABEL: 'test_gather_16f32_const_mask2' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SSE42-LABEL: 'test_gather_16f32_const_mask2' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = 
sext <16 x i32> %ind to <16 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX1-LABEL: 'test_gather_16f32_const_mask2' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX2-LABEL: 'test_gather_16f32_const_mask2' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: 
%gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; SKL-LABEL: 'test_gather_16f32_const_mask2' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; +; AVX512-LABEL: 'test_gather_16f32_const_mask2' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind +; AVX512-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res +; + %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> undef) + ret <16 x float>%res +} + +define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { +; SSE2-LABEL: 'test_scatter_16i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_16i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX1-LABEL: 'test_scatter_16i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX1-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX2-LABEL: 'test_scatter_16i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = 
getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SKL-LABEL: 'test_scatter_16i32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; SKL-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test_scatter_16i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> +; AVX512-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + + %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind + %imask = bitcast i16 %mask to <16 x i1> + call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>%val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) + ret void +} + +define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) { +; SSE2-LABEL: 'test_scatter_8i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_8i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test_scatter_8i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512-LABEL: 'test_scatter_8i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 
4, <8 x i1> %mask) + ret void +} + +define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) { +; SSE2-LABEL: 'test_scatter_4i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SSE42-LABEL: 'test_scatter_4i32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX-LABEL: 'test_scatter_4i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; KNL-LABEL: 'test_scatter_4i32' +; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; SKX-LABEL: 'test_scatter_4i32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) + ret void +} + +define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) { +; SSE2-LABEL: 'test_gather_4f32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr 
%ptr, <4 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SSE42-LABEL: 'test_gather_4f32' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX1-LABEL: 'test_gather_4f32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX2-LABEL: 'test_gather_4f32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 
x float> %res +; +; SKL-LABEL: 'test_gather_4f32' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; KNL-LABEL: 'test_gather_4f32' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKX-LABEL: 'test_gather_4f32' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) + ret <4 x float>%res +} + +define <4 x float> 
@test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) { +; SSE2-LABEL: 'test_gather_4f32_const_mask' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SSE42-LABEL: 'test_gather_4f32_const_mask' +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX1-LABEL: 'test_gather_4f32_const_mask' +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; AVX2-LABEL: 'test_gather_4f32_const_mask' +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 
for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKL-LABEL: 'test_gather_4f32_const_mask' +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; KNL-LABEL: 'test_gather_4f32_const_mask' +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res +; +; SKX-LABEL: 'test_gather_4f32_const_mask' +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> +; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind +; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) +; SKX-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: ret <4 x float> %res +; + %sext_ind = sext <4 x i32> %ind to <4 x i64> + %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind + + %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> , <4 x float> undef) + ret <4 x float>%res +} + +declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>) +declare <7 x double> @llvm.masked.load.v7f64.p0(ptr, i32, <7 x i1>, <7 x double>) +declare <6 x double> @llvm.masked.load.v6f64.p0(ptr, i32, <6 x i1>, <6 x double>) +declare <5 x double> @llvm.masked.load.v5f64.p0(ptr, i32, <5 x i1>, <5 x double>) +declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>) +declare <3 x double> @llvm.masked.load.v3f64.p0(ptr, i32, <3 x i1>, <3 x double>) +declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>) +declare <15 x float> @llvm.masked.load.v15f32.p0(ptr, i32, <15 x i1>, <15 x float>) +declare <14 x float> @llvm.masked.load.v14f32.p0(ptr, i32, <14 x i1>, <14 x float>) +declare <13 x float> @llvm.masked.load.v13f32.p0(ptr, i32, <13 x i1>, <13 x float>) +declare <12 x float> @llvm.masked.load.v12f32.p0(ptr, i32, <12 x i1>, <12 x float>) +declare <11 x float> @llvm.masked.load.v11f32.p0(ptr, i32, <11 x i1>, <11 x float>) +declare <10 x float> @llvm.masked.load.v10f32.p0(ptr, i32, <10 x i1>, <10 x float>) +declare <9 x float> @llvm.masked.load.v9f32.p0(ptr, i32, <9 x i1>, <9 x float>) +declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>) +declare <7 x float> @llvm.masked.load.v7f32.p0(ptr, i32, <7 x i1>, <7 x float>) +declare <6 x float> @llvm.masked.load.v6f32.p0(ptr, i32, <6 x i1>, <6 x float>) +declare <5 x float> @llvm.masked.load.v5f32.p0(ptr, i32, <5 x i1>, <5 x float>) +declare <4 x float> 
@llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>) +declare <3 x float> @llvm.masked.load.v3f32.p0(ptr, i32, <3 x i1>, <3 x float>) +declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>) +declare <1 x float> @llvm.masked.load.v1f32.p0(ptr, i32, <1 x i1>, <1 x float>) + +declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>) +declare <7 x i64> @llvm.masked.load.v7i64.p0(ptr, i32, <7 x i1>, <7 x i64>) +declare <6 x i64> @llvm.masked.load.v6i64.p0(ptr, i32, <6 x i1>, <6 x i64>) +declare <5 x i64> @llvm.masked.load.v5i64.p0(ptr, i32, <5 x i1>, <5 x i64>) +declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>) +declare <3 x i64> @llvm.masked.load.v3i64.p0(ptr, i32, <3 x i1>, <3 x i64>) +declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>) +declare <15 x i32> @llvm.masked.load.v15i32.p0(ptr, i32, <15 x i1>, <15 x i32>) +declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>) +declare <13 x i32> @llvm.masked.load.v13i32.p0(ptr, i32, <13 x i1>, <13 x i32>) +declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>) +declare <11 x i32> @llvm.masked.load.v11i32.p0(ptr, i32, <11 x i1>, <11 x i32>) +declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>) +declare <9 x i32> @llvm.masked.load.v9i32.p0(ptr, i32, <9 x i1>, <9 x i32>) +declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>) +declare <7 x i32> @llvm.masked.load.v7i32.p0(ptr, i32, <7 x i1>, <7 x i32>) +declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>) +declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr, i32, <5 x i1>, <5 x i32>) +declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>) +declare <3 x i32> 
@llvm.masked.load.v3i32.p0(ptr, i32, <3 x i1>, <3 x i32>) +declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>) +declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>) + +declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>) +declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7f64.p0(<7 x double>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6f64.p0(<6 x double>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5f64.p0(<5 x double>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3f64.p0(<3 x double>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1f64.p0(<1 x double>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v15f32.p0(<15 x float>, ptr, i32, <15 x i1>) +declare void @llvm.masked.store.v14f32.p0(<14 x float>, ptr, i32, <14 x i1>) +declare void @llvm.masked.store.v13f32.p0(<13 x float>, ptr, i32, <13 x i1>) +declare void @llvm.masked.store.v12f32.p0(<12 x float>, ptr, i32, <12 x i1>) +declare void @llvm.masked.store.v11f32.p0(<11 x float>, ptr, i32, <11 x i1>) +declare void @llvm.masked.store.v10f32.p0(<10 x float>, ptr, i32, <10 x i1>) 
+declare void @llvm.masked.store.v9f32.p0(<9 x float>, ptr, i32, <9 x i1>) +declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7f32.p0(<7 x float>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6f32.p0(<6 x float>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5f32.p0(<5 x float>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3f32.p0(<3 x float>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2f32.p0(<2 x float>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1f32.p0(<1 x float>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v8i64.p0(<8 x i64>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7i64.p0(<7 x i64>, ptr, i32, <7 x i1>) +declare void @llvm.masked.store.v6i64.p0(<6 x i64>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5i64.p0(<5 x i64>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3i64.p0(<3 x i64>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1i64.p0(<1 x i64>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v16i32.p0(<16 x i32>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v15i32.p0(<15 x i32>, ptr, i32, <15 x i1>) +declare void @llvm.masked.store.v14i32.p0(<14 x i32>, ptr, i32, <14 x i1>) +declare void @llvm.masked.store.v13i32.p0(<13 x i32>, ptr, i32, <13 x i1>) +declare void @llvm.masked.store.v12i32.p0(<12 x i32>, ptr, i32, <12 x i1>) +declare void @llvm.masked.store.v11i32.p0(<11 x i32>, ptr, i32, <11 x i1>) +declare void @llvm.masked.store.v10i32.p0(<10 x i32>, ptr, i32, <10 x i1>) +declare void @llvm.masked.store.v9i32.p0(<9 x i32>, ptr, i32, <9 x i1>) +declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v7i32.p0(<7 x i32>, ptr, 
i32, <7 x i1>) +declare void @llvm.masked.store.v6i32.p0(<6 x i32>, ptr, i32, <6 x i1>) +declare void @llvm.masked.store.v5i32.p0(<5 x i32>, ptr, i32, <5 x i1>) +declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>) +declare void @llvm.masked.store.v3i32.p0(<3 x i32>, ptr, i32, <3 x i1>) +declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>) +declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32, <1 x i1>) + +declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32, <32 x i1>) +declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>) +declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>) + +declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>) +declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>) +declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>) +declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>) + +declare <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>) + +declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>) +declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>) +declare <2 x i64> 
@llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>) +declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>) + +declare <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr>, i32, <64 x i1>, <64 x i8>) +declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.scatter.v8f64.v8p0(<8 x double>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1f64.v1p0(<1 x double>, <1 x ptr>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v8i64.v8p0(<8 x i64>, <8 x ptr>, i32, <8 
x i1>) +declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1i64.v1p0(<1 x i64>, <1 x ptr>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>) + +declare void @llvm.masked.scatter.v32i16.v32p0(<32 x i16>, <32 x ptr>, i32, <32 x i1>) +declare void @llvm.masked.scatter.v16i16.v16p0(<16 x i16>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>) + +declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8>, <64 x ptr>, i32, <64 x i1>) +declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32, <32 x i1>) +declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>) + +declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>) + +declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x 
i64>) +declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>) +declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>) +declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>) +declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>) + +declare <32 x i16> @llvm.masked.expandload.v32i16(ptr, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>) + +declare <64 x i8> @llvm.masked.expandload.v64i8(ptr, <64 x i1>, <64 x i8>) +declare <32 x i8> @llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>) + +declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>) +declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>) + +declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>) + +declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>) +declare void 
@llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>) + +declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>) + +declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>) +declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>) + +declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>) +declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>) +declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>) -- cgit v1.2.3 From c83f23d6abb6f8d693c643bc1b43f9b9e06bc537 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Thu, 4 Apr 2024 11:25:44 -0700 Subject: [AArch64] Fix heuristics for folding "lsl" into load/store ops. (#86894) The existing heuristics were assuming that every core behaves like an Apple A7, where any extend/shift costs an extra micro-op... but in reality, nothing else behaves like that. On some older Cortex designs, shifts by 1 or 4 cost extra, but all other shifts/extensions are free. On all other cores, as far as I can tell, all shifts/extensions for integer loads are free (i.e. the same cost as an unshifted load). To reflect this, this patch: - Enables aggressive folding of shifts into loads by default. - Removes the old AddrLSLFast feature, since it applies to everything except A7 (and even if you are explicitly targeting A7, we want to assume extensions are free because the code will almost always run on a newer core). 
- Adds a new feature AddrLSLSlow14 that applies specifically to the Cortex cores where shifts by 1 or 4 cost extra. I didn't add support for AddrLSLSlow14 on the GlobalISel side because it would require a bunch of refactoring to work correctly. Someone can pick this up as a followup. --- llvm/lib/Target/AArch64/AArch64.td | 53 +++++----- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 29 +++--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 2 +- .../AArch64/GISel/AArch64InstructionSelector.cpp | 6 +- .../AArch64/GlobalISel/load-addressing-modes.mir | 12 +-- llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll | 112 +++++++-------------- .../AArch64/aarch64-split-and-bitmask-immediate.ll | 5 +- .../CodeGen/AArch64/arm64-addr-mode-folding.ll | 2 +- llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll | 20 ++-- .../CodeGen/AArch64/avoid-free-ext-promotion.ll | 11 +- llvm/test/CodeGen/AArch64/cheap-as-a-move.ll | 30 +++--- llvm/test/CodeGen/AArch64/extract-bits.ll | 5 +- .../CodeGen/AArch64/machine-licm-hoist-load.ll | 5 +- llvm/test/CodeGen/AArch64/sink-and-fold.ll | 4 +- 14 files changed, 119 insertions(+), 177 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 6425aa9b091f..3af427d526f8 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -391,9 +391,18 @@ def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", "equivalent when the immediate does " "not fit in the encoding.">; -def FeatureAddrLSLFast : SubtargetFeature< - "addr-lsl-fast", "HasAddrLSLFast", "true", - "Address operands with logical shift of up to 3 places are cheap">; +// Address operands with shift amount 2 or 3 are fast on all Arm chips except +// some old Apple cores (A7-A10?) which handle all shifts slowly. Cortex-A57 +// and derived designs through Cortex-X1 take an extra micro-op for shifts +// of 1 or 4. Other Arm chips handle all shifted operands at the same speed +// as unshifted operands. 
+// +// We don't try to model the behavior of the old Apple cores because new code +// targeting A7 is very unlikely to actually run on an A7. The Cortex cores +// are modeled by FeatureAddrLSLSlow14. +def FeatureAddrLSLSlow14 : SubtargetFeature< + "addr-lsl-slow-14", "HasAddrLSLSlow14", "true", + "Address operands with shift amount of 1 or 4 are slow">; def FeatureALULSLFast : SubtargetFeature< "alu-lsl-fast", "HasALULSLFast", "true", @@ -885,6 +894,7 @@ def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", FeatureBalanceFPOps, FeatureFuseAdrpAdd, FeatureFuseLiterals, + FeatureAddrLSLSlow14, FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -903,6 +913,7 @@ def TuneA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", FeatureFuseAES, FeatureFuseAdrpAdd, FeatureFuseLiterals, + FeatureAddrLSLSlow14, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -910,6 +921,7 @@ def TuneA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", "Cortex-A73 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, + FeatureAddrLSLSlow14, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -917,6 +929,7 @@ def TuneA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75", "Cortex-A75 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, + FeatureAddrLSLSlow14, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -924,7 +937,7 @@ def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76", "Cortex-A76 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureAddrLSLFast, + FeatureAddrLSLSlow14, FeatureALULSLFast, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; @@ -934,7 +947,7 @@ def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77", FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureAddrLSLFast, + FeatureAddrLSLSlow14, FeatureALULSLFast, FeatureEnableSelectOptimize, 
FeaturePredictableSelectIsExpensive]>; @@ -944,7 +957,7 @@ def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78", FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureAddrLSLFast, + FeatureAddrLSLSlow14, FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, @@ -956,7 +969,7 @@ def TuneA78AE : SubtargetFeature<"a78ae", "ARMProcFamily", FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureAddrLSLFast, + FeatureAddrLSLSlow14, FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, @@ -968,7 +981,7 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily", FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureAddrLSLFast, + FeatureAddrLSLSlow14, FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, @@ -979,7 +992,6 @@ def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710", FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureAddrLSLFast, FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, @@ -990,7 +1002,6 @@ def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715", FeatureFuseAES, FeaturePostRAScheduler, FeatureCmpBccFusion, - FeatureAddrLSLFast, FeatureALULSLFast, FeatureFuseAdrpAdd, FeatureEnableSelectOptimize, @@ -1001,7 +1012,6 @@ def TuneA720 : SubtargetFeature<"a720", "ARMProcFamily", "CortexA720", FeatureFuseAES, FeaturePostRAScheduler, FeatureCmpBccFusion, - FeatureAddrLSLFast, FeatureALULSLFast, FeatureFuseAdrpAdd, FeatureEnableSelectOptimize, @@ -1012,7 +1022,6 @@ def TuneA720AE : SubtargetFeature<"a720ae", "ARMProcFamily", "CortexA720", FeatureFuseAES, FeaturePostRAScheduler, FeatureCmpBccFusion, - FeatureAddrLSLFast, FeatureALULSLFast, FeatureFuseAdrpAdd, FeatureEnableSelectOptimize, @@ -1028,7 +1037,7 @@ def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1", FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureAddrLSLFast, + FeatureAddrLSLSlow14, 
FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, @@ -1039,7 +1048,6 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2", FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureAddrLSLFast, FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, @@ -1047,7 +1055,6 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2", def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3", "Cortex-X3 ARM processors", [ - FeatureAddrLSLFast, FeatureALULSLFast, FeatureFuseAdrpAdd, FeatureFuseAES, @@ -1057,7 +1064,6 @@ def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3", def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4", "Cortex-X4 ARM processors", [ - FeatureAddrLSLFast, FeatureALULSLFast, FeatureFuseAdrpAdd, FeatureFuseAES, @@ -1215,7 +1221,6 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", FeatureFuseAdrpAdd, FeatureFuseLiterals, FeatureStorePairSuppress, - FeatureAddrLSLFast, FeatureALULSLFast, FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive]>; @@ -1234,7 +1239,6 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3", FeatureFuseAdrpAdd, FeatureFuseLiterals, FeatureStorePairSuppress, - FeatureAddrLSLFast, FeatureALULSLFast, FeaturePostRAScheduler, FeatureZCZeroing]>; @@ -1244,7 +1248,6 @@ def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo", FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, FeatureZCZeroing, - FeatureAddrLSLFast, FeatureALULSLFast, FeatureStorePairSuppress]>; @@ -1254,7 +1257,6 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor", FeaturePredictableSelectIsExpensive, FeatureZCZeroing, FeatureStorePairSuppress, - FeatureAddrLSLFast, FeatureALULSLFast, FeatureSlowSTRQro]>; @@ -1268,7 +1270,7 @@ def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1 "Neoverse N1 ARM processors", [ 
FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureAddrLSLFast, + FeatureAddrLSLSlow14, FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, @@ -1278,7 +1280,6 @@ def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2 "Neoverse N2 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureAddrLSLFast, FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, @@ -1288,7 +1289,6 @@ def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Ne "Neoverse 512-TVB ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureAddrLSLFast, FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, @@ -1298,7 +1298,7 @@ def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1 "Neoverse V1 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureAddrLSLFast, + FeatureAddrLSLSlow14, FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, @@ -1309,7 +1309,6 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2 "Neoverse V2 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureAddrLSLFast, FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, @@ -1321,7 +1320,6 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", FeaturePredictableSelectIsExpensive, FeatureZCZeroing, FeatureStorePairSuppress, - FeatureAddrLSLFast, FeatureALULSLFast]>; def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99", @@ -1381,7 +1379,6 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1", FeaturePostRAScheduler, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureAddrLSLFast, FeatureALULSLFast, FeatureAggressiveFMA, FeatureArithmeticBccFusion, @@ -1397,7 +1394,6 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A", FeaturePostRAScheduler, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureAddrLSLFast, FeatureALULSLFast, 
FeatureAggressiveFMA, FeatureArithmeticBccFusion, @@ -1414,7 +1410,6 @@ def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B", FeaturePostRAScheduler, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureAddrLSLFast, FeatureALULSLFast, FeatureAggressiveFMA, FeatureArithmeticBccFusion, diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 163ed520a8a6..51bec3604026 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -462,7 +462,7 @@ private: SDValue &Offset, SDValue &SignExtend, SDValue &DoShift); bool isWorthFoldingALU(SDValue V, bool LSL = false) const; - bool isWorthFoldingAddr(SDValue V) const; + bool isWorthFoldingAddr(SDValue V, unsigned Size) const; bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend, SDValue &Offset, SDValue &SignExtend); @@ -674,17 +674,22 @@ static bool isWorthFoldingSHL(SDValue V) { /// Determine whether it is worth to fold V into an extended register addressing /// mode. -bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V) const { +bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V, unsigned Size) const { // Trivial if we are optimizing for code size or if there is only // one use of the value. if (CurDAG->shouldOptForSize() || V.hasOneUse()) return true; - // If a subtarget has a fastpath LSL we can fold a logical shift into - // the addressing mode and save a cycle. - if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::SHL && - isWorthFoldingSHL(V)) + + // If a subtarget has a slow shift, folding a shift into multiple loads + // costs additional micro-ops. + if (Subtarget->hasAddrLSLSlow14() && (Size == 2 || Size == 16)) + return false; + + // Check whether we're going to emit the address arithmetic anyway because + // it's used by a non-address operation. 
+ if (V.getOpcode() == ISD::SHL && isWorthFoldingSHL(V)) return true; - if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::ADD) { + if (V.getOpcode() == ISD::ADD) { const SDValue LHS = V.getOperand(0); const SDValue RHS = V.getOperand(1); if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS)) @@ -1203,7 +1208,7 @@ bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size, if (ShiftVal != 0 && ShiftVal != LegalShiftVal) return false; - return isWorthFoldingAddr(N); + return isWorthFoldingAddr(N, Size); } bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, @@ -1231,7 +1236,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, } // Remember if it is worth folding N when it produces extended register. - bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N); + bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N, Size); // Try to match a shifted extend on the RHS. if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && @@ -1261,7 +1266,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0)); SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl, MVT::i32); - if (isWorthFoldingAddr(LHS)) + if (isWorthFoldingAddr(LHS, Size)) return true; } @@ -1273,7 +1278,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0)); SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl, MVT::i32); - if (isWorthFoldingAddr(RHS)) + if (isWorthFoldingAddr(RHS, Size)) return true; } @@ -1343,7 +1348,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, } // Remember if it is worth folding N when it produces extended register. - bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N); + bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N, Size); // Try to match a shifted extend on the RHS. 
if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index d0c5e6b99e9e..22687b0e31c2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2993,7 +2993,7 @@ bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI, return false; Shift = AArch64_AM::getShiftValue(Shift); if (!OptSize) { - if ((Shift != 2 && Shift != 3) || !Subtarget.hasAddrLSLFast()) + if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14()) return false; if (avoidSlowSTRQ(MemI)) return false; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index a8f2c45279e6..d4daf17a39d5 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -6907,10 +6907,8 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( MI.getParent()->getParent()->getFunction().hasOptSize()) return true; - // It's better to avoid folding and recomputing shifts when we don't have a - // fastpath. - if (!STI.hasAddrLSLFast()) - return false; + // FIXME: Consider checking HasAddrLSLSlow14 and HasALULSLFast as + // appropriate. // We have a fastpath, so folding a shift in and potentially computing it // many times may be beneficial. Check if this is only used in memory ops. 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir index 499c08fa4966..7921de6ce236 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir @@ -15,7 +15,7 @@ define void @mul_wrong_pow_2(ptr %addr) { ret void } define void @more_than_one_use_shl_1(ptr %addr) { ret void } define void @more_than_one_use_shl_2(ptr %addr) { ret void } - define void @more_than_one_use_shl_lsl_fast(ptr %addr) #1 { ret void } + define void @more_than_one_use_shl_lsl_fast(ptr %addr) { ret void } define void @more_than_one_use_shl_lsl_slow(ptr %addr) { ret void } define void @more_than_one_use_shl_minsize(ptr %addr) #0 { ret void } define void @ldrwrox(ptr %addr) { ret void } @@ -24,7 +24,6 @@ define void @ldbbrox(ptr %addr) { ret void } define void @ldrqrox(ptr %addr) { ret void } attributes #0 = { optsize } - attributes #1 = { "target-features"="+addr-lsl-fast" } ... 
--- @@ -478,11 +477,10 @@ body: | ; CHECK: liveins: $x0, $x1, $x2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1 - ; CHECK-NEXT: [[ADDXrs:%[0-9]+]]:gpr64common = ADDXrs [[COPY1]], [[COPY]], 3 - ; CHECK-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrs]], 0 :: (load (s64) from %ir.addr) - ; CHECK-NEXT: [[LDRXui1:%[0-9]+]]:gpr64 = LDRXui [[ADDXrs]], 0 :: (load (s64) from %ir.addr) - ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[LDRXui1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: [[LDRXroX1:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr) + ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[LDRXroX1]] ; CHECK-NEXT: $x2 = COPY [[ADDXrr]] ; CHECK-NEXT: RET_ReallyLR implicit $x2 %0:gpr(s64) = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll index 59cd87f58ab0..022aaea9ef0c 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK0 -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK3 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-slow-14 | FileCheck %s --check-prefixes=CHECK,CHECK0 +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK3 %struct.a = type [256 x i16] %struct.b = type [256 x i32] @@ -49,36 +49,20 @@ define i16 @halfword(ptr %ctx, i32 %xor72) nounwind { } define i32 @word(ptr %ctx, i32 %xor72) nounwind { -; CHECK0-LABEL: word: -; CHECK0: // %bb.0: -; CHECK0-NEXT: stp x30, x21, [sp, 
#-32]! // 16-byte Folded Spill -; CHECK0-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK0-NEXT: ubfx x8, x1, #9, #8 -; CHECK0-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK0-NEXT: mov x19, x0 -; CHECK0-NEXT: lsl x21, x8, #2 -; CHECK0-NEXT: ldr w20, [x0, x21] -; CHECK0-NEXT: bl foo -; CHECK0-NEXT: mov w0, w20 -; CHECK0-NEXT: str w20, [x19, x21] -; CHECK0-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK0-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload -; CHECK0-NEXT: ret -; -; CHECK3-LABEL: word: -; CHECK3: // %bb.0: -; CHECK3-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill -; CHECK3-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK3-NEXT: ubfx x21, x1, #9, #8 -; CHECK3-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK3-NEXT: mov x19, x0 -; CHECK3-NEXT: ldr w20, [x0, x21, lsl #2] -; CHECK3-NEXT: bl foo -; CHECK3-NEXT: mov w0, w20 -; CHECK3-NEXT: str w20, [x19, x21, lsl #2] -; CHECK3-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK3-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload -; CHECK3-NEXT: ret +; CHECK-LABEL: word: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: ubfx x21, x1, #9, #8 +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: ldr w20, [x0, x21, lsl #2] +; CHECK-NEXT: bl foo +; CHECK-NEXT: mov w0, w20 +; CHECK-NEXT: str w20, [x19, x21, lsl #2] +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret %shr81 = lshr i32 %xor72, 9 %conv82 = zext i32 %shr81 to i64 %idxprom83 = and i64 %conv82, 255 @@ -90,36 +74,20 @@ define i32 @word(ptr %ctx, i32 %xor72) nounwind { } define i64 @doubleword(ptr %ctx, i32 %xor72) nounwind { -; CHECK0-LABEL: doubleword: -; CHECK0: // %bb.0: -; CHECK0-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK0-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK0-NEXT: ubfx x8, x1, #9, #8 -; CHECK0-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK0-NEXT: mov x19, x0 -; CHECK0-NEXT: lsl x21, x8, #3 -; CHECK0-NEXT: ldr x20, [x0, x21] -; CHECK0-NEXT: bl foo -; CHECK0-NEXT: mov x0, x20 -; CHECK0-NEXT: str x20, [x19, x21] -; CHECK0-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK0-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload -; CHECK0-NEXT: ret -; -; CHECK3-LABEL: doubleword: -; CHECK3: // %bb.0: -; CHECK3-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill -; CHECK3-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK3-NEXT: ubfx x21, x1, #9, #8 -; CHECK3-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK3-NEXT: mov x19, x0 -; CHECK3-NEXT: ldr x20, [x0, x21, lsl #3] -; CHECK3-NEXT: bl foo -; CHECK3-NEXT: mov x0, x20 -; CHECK3-NEXT: str x20, [x19, x21, lsl #3] -; CHECK3-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK3-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload -; CHECK3-NEXT: ret +; CHECK-LABEL: doubleword: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: ubfx x21, x1, #9, #8 +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: ldr x20, [x0, x21, lsl #3] +; CHECK-NEXT: bl foo +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: str x20, [x19, x21, lsl #3] +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret %shr81 = lshr i32 %xor72, 9 %conv82 = zext i32 %shr81 to i64 %idxprom83 = and i64 %conv82, 255 @@ -163,20 +131,12 @@ endbb: } define i64 @gep3(ptr %p, i64 %b) { -; CHECK0-LABEL: gep3: -; CHECK0: // %bb.0: -; CHECK0-NEXT: lsl x9, x1, #3 -; CHECK0-NEXT: mov x8, x0 -; CHECK0-NEXT: ldr x0, [x0, x9] -; CHECK0-NEXT: str x1, [x8, x9] -; CHECK0-NEXT: ret -; -; CHECK3-LABEL: gep3: -; CHECK3: // %bb.0: -; CHECK3-NEXT: mov x8, x0 -; CHECK3-NEXT: ldr x0, [x0, x1, lsl #3] -; CHECK3-NEXT: str x1, [x8, x1, lsl #3] -; CHECK3-NEXT: ret +; CHECK-LABEL: gep3: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: ldr x0, [x0, x1, lsl #3] +; CHECK-NEXT: str x1, [x8, x1, lsl #3] +; CHECK-NEXT: ret %g = getelementptr inbounds i64, ptr %p, i64 %b %l = load i64, ptr %g store i64 %b, ptr %g diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll index 573f921e638c..e31c9a072dc4 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll @@ -134,9 +134,8 @@ define void @test8(i64 %a, ptr noalias %src, ptr noalias %dst, i64 %n) { ; CHECK-NEXT: b.hs .LBB7_1 ; CHECK-NEXT: // %bb.3: // %if.then ; CHECK-NEXT: // in Loop: Header=BB7_2 Depth=1 -; CHECK-NEXT: lsl x10, x8, #3 -; CHECK-NEXT: ldr x11, [x1, x10] -; CHECK-NEXT: str x11, [x2, x10] +; CHECK-NEXT: ldr x10, [x1, x8, lsl #3] +; CHECK-NEXT: str x10, [x2, x8, lsl #3] ; CHECK-NEXT: b .LBB7_1 
; CHECK-NEXT: .LBB7_4: // %exit ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll b/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll index d593272be1aa..6bcd2f04849b 100644 --- a/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll +++ b/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll @@ -125,7 +125,7 @@ return: ; preds = %if.end23, %if.then3 } ; CHECK: @test -; CHECK-NOT: , uxtw #2] +; CHECK: , uxtw #2] define i32 @test(ptr %array, i8 zeroext %c, i32 %arg) { entry: %conv = zext i8 %c to i32 diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll index 3542b26b5353..5b055a4eb37a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll @@ -201,11 +201,10 @@ define void @fct1_64x1(ptr nocapture %array, i64 %offset) nounwind ssp { ; CHECK-LABEL: fct1_64x1: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, :got:globalArray64x1 -; CHECK-NEXT: lsl x9, x1, #3 ; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray64x1] -; CHECK-NEXT: ldr d0, [x0, x9] +; CHECK-NEXT: ldr d0, [x0, x1, lsl #3] ; CHECK-NEXT: ldr x8, [x8] -; CHECK-NEXT: str d0, [x8, x9] +; CHECK-NEXT: str d0, [x8, x1, lsl #3] ; CHECK-NEXT: ret entry: %arrayidx = getelementptr inbounds <1 x i64>, ptr %array, i64 %offset @@ -238,11 +237,10 @@ define void @fct1_32x2(ptr nocapture %array, i64 %offset) nounwind ssp { ; CHECK-LABEL: fct1_32x2: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, :got:globalArray32x2 -; CHECK-NEXT: lsl x9, x1, #3 ; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray32x2] -; CHECK-NEXT: ldr d0, [x0, x9] +; CHECK-NEXT: ldr d0, [x0, x1, lsl #3] ; CHECK-NEXT: ldr x8, [x8] -; CHECK-NEXT: str d0, [x8, x9] +; CHECK-NEXT: str d0, [x8, x1, lsl #3] ; CHECK-NEXT: ret entry: %arrayidx = getelementptr inbounds <2 x i32>, ptr %array, i64 %offset @@ -275,11 +273,10 @@ define void @fct1_16x4(ptr nocapture %array, i64 %offset) nounwind ssp { ; CHECK-LABEL: 
fct1_16x4: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, :got:globalArray16x4 -; CHECK-NEXT: lsl x9, x1, #3 ; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray16x4] -; CHECK-NEXT: ldr d0, [x0, x9] +; CHECK-NEXT: ldr d0, [x0, x1, lsl #3] ; CHECK-NEXT: ldr x8, [x8] -; CHECK-NEXT: str d0, [x8, x9] +; CHECK-NEXT: str d0, [x8, x1, lsl #3] ; CHECK-NEXT: ret entry: %arrayidx = getelementptr inbounds <4 x i16>, ptr %array, i64 %offset @@ -312,11 +309,10 @@ define void @fct1_8x8(ptr nocapture %array, i64 %offset) nounwind ssp { ; CHECK-LABEL: fct1_8x8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, :got:globalArray8x8 -; CHECK-NEXT: lsl x9, x1, #3 ; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray8x8] -; CHECK-NEXT: ldr d0, [x0, x9] +; CHECK-NEXT: ldr d0, [x0, x1, lsl #3] ; CHECK-NEXT: ldr x8, [x8] -; CHECK-NEXT: str d0, [x8, x9] +; CHECK-NEXT: str d0, [x8, x1, lsl #3] ; CHECK-NEXT: ret entry: %arrayidx = getelementptr inbounds <8 x i8>, ptr %array, i64 %offset diff --git a/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll b/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll index 8f195531927e..634d1b90ff90 100644 --- a/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll +++ b/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll @@ -82,13 +82,12 @@ define void @avoid_promotion_2_and(ptr nocapture noundef %arg) { ; CHECK-NEXT: eor w10, w10, w11 ; CHECK-NEXT: ldur w11, [x8, #-24] ; CHECK-NEXT: and w10, w10, w14 -; CHECK-NEXT: ldp x15, x14, [x8, #-16] -; CHECK-NEXT: ubfiz x13, x10, #1, #32 +; CHECK-NEXT: ldp x14, x13, [x8, #-16] ; CHECK-NEXT: str w10, [x8] -; CHECK-NEXT: and w10, w11, w12 -; CHECK-NEXT: ldrh w11, [x14, x13] -; CHECK-NEXT: strh w11, [x15, w10, uxtw #1] -; CHECK-NEXT: strh w12, [x14, x13] +; CHECK-NEXT: and w11, w11, w12 +; CHECK-NEXT: ldrh w15, [x13, w10, uxtw #1] +; CHECK-NEXT: strh w15, [x14, w11, uxtw #1] +; CHECK-NEXT: strh w12, [x13, w10, uxtw #1] ; CHECK-NEXT: b LBB1_1 ; CHECK-NEXT: LBB1_4: ; %exit ; CHECK-NEXT: ret diff --git 
a/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll b/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll index b5c21044266c..50c70c5676c4 100644 --- a/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll +++ b/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll @@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux" define void @f0(ptr %a, i64 %n) { ; CHECK-LABEL: f0: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill ; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 48 @@ -15,7 +15,6 @@ define void @f0(ptr %a, i64 %n) { ; CHECK-NEXT: .cfi_offset w20, -16 ; CHECK-NEXT: .cfi_offset w21, -24 ; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w23, -40 ; CHECK-NEXT: .cfi_offset w30, -48 ; CHECK-NEXT: mov x21, #1 // =0x1 ; CHECK-NEXT: mov x19, x1 @@ -27,18 +26,17 @@ define void @f0(ptr %a, i64 %n) { ; CHECK-NEXT: b.ge .LBB0_2 ; CHECK-NEXT: .LBB0_1: // %loop.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lsl x23, x22, #2 +; CHECK-NEXT: ldr w0, [x20, x22, lsl #2] ; CHECK-NEXT: mov x1, x21 -; CHECK-NEXT: ldr w0, [x20, x23] ; CHECK-NEXT: bl g -; CHECK-NEXT: str w0, [x20, x23] +; CHECK-NEXT: str w0, [x20, x22, lsl #2] ; CHECK-NEXT: add x22, x22, #1 ; CHECK-NEXT: cmp x22, x19 ; CHECK-NEXT: b.lt .LBB0_1 ; CHECK-NEXT: .LBB0_2: // %exit ; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: br label %loop @@ -64,15 +62,13 @@ exit: define void @f1(ptr %a, i64 %n) { ; CHECK-LABEL: f1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 ; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w30, -48 +; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: mov x19, x1 ; CHECK-NEXT: mov x20, x0 ; CHECK-NEXT: mov x21, xzr @@ -80,19 +76,17 @@ define void @f1(ptr %a, i64 %n) { ; CHECK-NEXT: b.ge .LBB1_2 ; CHECK-NEXT: .LBB1_1: // %loop.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lsl x22, x21, #2 +; CHECK-NEXT: ldr w0, [x20, x21, lsl #2] ; CHECK-NEXT: mov x1, #1450704896 // =0x56780000 ; CHECK-NEXT: movk x1, #4660, lsl #48 -; CHECK-NEXT: ldr w0, [x20, x22] ; CHECK-NEXT: bl g -; CHECK-NEXT: str w0, [x20, x22] +; CHECK-NEXT: str w0, [x20, x21, lsl #2] ; CHECK-NEXT: add x21, x21, #1 ; CHECK-NEXT: cmp x21, x19 ; CHECK-NEXT: b.lt .LBB1_1 ; CHECK-NEXT: .LBB1_2: // %exit -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: br label %loop diff --git a/llvm/test/CodeGen/AArch64/extract-bits.ll b/llvm/test/CodeGen/AArch64/extract-bits.ll index d4ea143a3d84..b87157a18383 100644 --- a/llvm/test/CodeGen/AArch64/extract-bits.ll +++ b/llvm/test/CodeGen/AArch64/extract-bits.ll @@ -972,10 +972,9 @@ define void @pr38938(ptr %a0, ptr %a1) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr x8, [x1] ; CHECK-NEXT: ubfx x8, x8, #21, #10 -; CHECK-NEXT: lsl x8, x8, #2 -; 
CHECK-NEXT: ldr w9, [x0, x8] +; CHECK-NEXT: ldr w9, [x0, x8, lsl #2] ; CHECK-NEXT: add w9, w9, #1 -; CHECK-NEXT: str w9, [x0, x8] +; CHECK-NEXT: str w9, [x0, x8, lsl #2] ; CHECK-NEXT: ret %tmp = load i64, ptr %a1, align 8 %tmp1 = lshr i64 %tmp, 21 diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll index 30123a31cebb..e8dafd5e8fba 100644 --- a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll +++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll @@ -223,10 +223,9 @@ define i64 @three_dimensional_middle(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) { ; CHECK-NEXT: // Parent Loop BB3_1 Depth=1 ; CHECK-NEXT: // => This Loop Header: Depth=2 ; CHECK-NEXT: // Child Loop BB3_3 Depth 3 -; CHECK-NEXT: lsl x12, x11, #3 +; CHECK-NEXT: ldr x13, [x1, x11, lsl #3] +; CHECK-NEXT: ldr x12, [x10, x11, lsl #3] ; CHECK-NEXT: mov x14, x4 -; CHECK-NEXT: ldr x13, [x1, x12] -; CHECK-NEXT: ldr x12, [x10, x12] ; CHECK-NEXT: ldr w13, [x13] ; CHECK-NEXT: .LBB3_3: // %for.body8 ; CHECK-NEXT: // Parent Loop BB3_1 Depth=1 diff --git a/llvm/test/CodeGen/AArch64/sink-and-fold.ll b/llvm/test/CodeGen/AArch64/sink-and-fold.ll index 52007221e12a..f65a08ae2ace 100644 --- a/llvm/test/CodeGen/AArch64/sink-and-fold.ll +++ b/llvm/test/CodeGen/AArch64/sink-and-fold.ll @@ -100,7 +100,7 @@ exit: } ; Address calculation cheap enough on some cores. 
-define i32 @f3(i1 %c1, ptr %p, i64 %i) nounwind "target-features"="+alu-lsl-fast,+addr-lsl-fast" { +define i32 @f3(i1 %c1, ptr %p, i64 %i) nounwind "target-features"="+alu-lsl-fast" { ; CHECK-LABEL: f3: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: tbz w0, #0, .LBB3_2 @@ -130,7 +130,7 @@ exit: ret i32 %v } -define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast,+addr-lsl-fast" { +define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast" { ; CHECK-LABEL: f4: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cmp x1, #1 -- cgit v1.2.3 From 5ae143da459a73d0f6fc796d42c3fabf3ab5f9e8 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 4 Apr 2024 11:02:45 -0700 Subject: [SLP]Add a test with the incorrect casting for external user, NFC. --- .../X86/external-user-instruction-minbitwidth.ll | 64 ++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll new file mode 100644 index 000000000000..9d3d602d0743 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll @@ -0,0 +1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +@e = global i8 0 +@c = global i16 0 +@d = global i32 0 + +define i8 @test() { +; CHECK-LABEL: define i8 @test() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @e, align 1 +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @c, align 2 +; CHECK-NEXT: [[CONV1:%.*]] = zext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[CONV]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = 
shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i16> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[CONV1]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = trunc <8 x i32> [[TMP7]] to <8 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP9]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-NEXT: [[CONV4_30:%.*]] = trunc i32 [[TMP11]] to i8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 +; CHECK-NEXT: [[TMP13:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-NEXT: [[XOR_31:%.*]] = and i32 [[TMP13]], -2 +; CHECK-NEXT: store i32 [[XOR_31]], ptr @d, align 4 +; CHECK-NEXT: ret i8 [[CONV4_30]] +; +entry: + %0 = load i8, ptr @e, align 1 + %conv = sext i8 %0 to i32 + %1 = load i16, ptr @c, align 2 + %conv1 = zext i16 %1 to i32 + %or.16 = or i32 %conv, 1 + %add.16 = add nsw i32 %or.16, %conv1 + %or.18 = or i32 %conv, 1 + %add.18 = add nsw i32 %or.18, %conv1 + %conv4.181 = or i32 %add.16, %add.18 + %or.20 = or i32 %conv, 1 + %add.20 = add nsw i32 %or.20, %conv1 + %conv4.202 = or i32 %conv4.181, %add.20 + %or.22 = or i32 %conv, 1 + %add.22 = add nsw i32 %or.22, %conv1 + %conv4.223 = or i32 %conv4.202, %add.22 + %or.24 = or i32 %conv, 1 + %add.24 = add nsw i32 %or.24, %conv1 + %conv4.244 = or i32 %conv4.223, %add.24 + %or.26 = or i32 %conv, 1 + %add.26 = add nsw i32 %or.26, %conv1 + %conv4.265 = or i32 %conv4.244, %add.26 + %or.28 = or i32 %conv, 1 + %add.28 = add nsw i32 %or.28, %conv1 + %conv4.286 = or i32 %conv4.265, %add.28 + %or.30 = or i32 %conv, 32769 + %add.30 = add nsw i32 %or.30, %conv1 + %conv4.307 = or i32 %conv4.286, %add.30 + %conv4.30 = trunc i32 %conv4.307 to i8 + 
%xor.31 = and i32 %or.30, -2 + store i32 %xor.31, ptr @d, align 4 + ret i8 %conv4.30 +} -- cgit v1.2.3 From 8004ce236795d48063c636d1ce4818ebcb526b21 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 4 Apr 2024 13:33:19 -0500 Subject: [libc] Move thread sync when closing port earlier Summary: This synchronization should be done before we handle the logic relating to closing the port. This isn't majorly important now but it would break if we ever decided to run a server on the GPU. --- libc/src/__support/RPC/rpc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h index 5dcae518bb6f..05506c04fc07 100644 --- a/libc/src/__support/RPC/rpc.h +++ b/libc/src/__support/RPC/rpc.h @@ -198,12 +198,9 @@ template struct Process { /// convergent, otherwise the compiler will sink the store and deadlock. [[clang::convergent]] LIBC_INLINE void unlock(uint64_t lane_mask, uint32_t index) { - // Do not move any writes past the unlock + // Do not move any writes past the unlock. atomic_thread_fence(cpp::MemoryOrder::RELEASE); - // Wait for other threads in the warp to finish using the lock - gpu::sync_lane(lane_mask); - // Use exactly one thread to clear the nth bit in the lock array Must // restrict to a single thread to avoid one thread dropping the lock, then // an unrelated warp claiming the lock, then a second thread in this warp @@ -331,6 +328,9 @@ public: LIBC_INLINE uint16_t get_index() const { return index; } LIBC_INLINE void close() { + // Wait for all lanes to finish using the port. + gpu::sync_lane(lane_mask); + // The server is passive, if it own the buffer when it closes we need to // give ownership back to the client. if (owns_buffer && T) -- cgit v1.2.3 From d6713ad80d6907210c629f22babaf12177fa329c Mon Sep 17 00:00:00 2001 From: Kevin Frei Date: Thu, 4 Apr 2024 11:43:55 -0700 Subject: Debuginfod Testing & fixes: 3rd times the charm? 
(#87676) I believe I've got the tests properly configured to only run on Linux x86(_64), as I don't have a Linux AArch64/Arm device to diagnose what's going wrong with the tests (I suspect there's some issue with generating `.note.gnu.build-id` sections...) The actual code fixes have now been reviewed 3 times: https://github.com/llvm/llvm-project/pull/79181 (moved shell tests to API tests), https://github.com/llvm/llvm-project/pull/85693 (Changed some of the testing infra), and https://github.com/llvm/llvm-project/pull/86812 (didn't get the tests configured quite right). The Debuginfod integration for symbol acquisition in LLDB now works with the `executable` and `debuginfo` Debuginfod network requests working properly for normal, `objcopy --only-keep-debug` stripped, split-dwarf, and `objcopy --only-keep-debug` stripped *plus* split-dwarf symbols/binaries. The reasons for the multiple attempts have been tests on platforms I don't have access to (Linux AArch64/Arm + MacOS x86_64). I believe I've got the tests properly disabled for everything except for Linux x86(_64) now. I've built & tested on MacOS AArch64 and Linux x86_64. 
--------- Co-authored-by: Kevin Frei --- .../Python/lldbsuite/test/make/Makefile.rules | 26 ++- .../Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp | 38 ++-- lldb/source/Plugins/SymbolLocator/CMakeLists.txt | 7 +- .../Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp | 29 +++- lldb/test/API/debuginfod/Normal/Makefile | 19 ++ lldb/test/API/debuginfod/Normal/TestDebuginfod.py | 183 +++++++++++++++++++ lldb/test/API/debuginfod/Normal/main.c | 7 + lldb/test/API/debuginfod/SplitDWARF/Makefile | 23 +++ .../API/debuginfod/SplitDWARF/TestDebuginfodDWP.py | 193 +++++++++++++++++++++ lldb/test/API/debuginfod/SplitDWARF/main.c | 7 + 10 files changed, 515 insertions(+), 17 deletions(-) create mode 100644 lldb/test/API/debuginfod/Normal/Makefile create mode 100644 lldb/test/API/debuginfod/Normal/TestDebuginfod.py create mode 100644 lldb/test/API/debuginfod/Normal/main.c create mode 100644 lldb/test/API/debuginfod/SplitDWARF/Makefile create mode 100644 lldb/test/API/debuginfod/SplitDWARF/TestDebuginfodDWP.py create mode 100644 lldb/test/API/debuginfod/SplitDWARF/main.c diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index bfd249ccd43f..ee8793fa1b30 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -51,7 +51,7 @@ LLDB_BASE_DIR := $(THIS_FILE_DIR)/../../../../../ # # GNUWin32 uname gives "windows32" or "server version windows32" while # some versions of MSYS uname return "MSYS_NT*", but most environments -# standardize on "Windows_NT", so we'll make it consistent here. +# standardize on "Windows_NT", so we'll make it consistent here. # When running tests from Visual Studio, the environment variable isn't # inherited all the way down to the process spawned for make. 
#---------------------------------------------------------------------- @@ -210,6 +210,12 @@ else ifeq "$(SPLIT_DEBUG_SYMBOLS)" "YES" DSYM = $(EXE).debug endif + + ifeq "$(MAKE_DWP)" "YES" + MAKE_DWO := YES + DWP_NAME = $(EXE).dwp + DYLIB_DWP_NAME = $(DYLIB_NAME).dwp + endif endif LIMIT_DEBUG_INFO_FLAGS = @@ -357,6 +363,7 @@ ifneq "$(OS)" "Darwin" OBJCOPY ?= $(call replace_cc_with,objcopy) ARCHIVER ?= $(call replace_cc_with,ar) + DWP ?= $(call replace_cc_with,dwp) override AR = $(ARCHIVER) endif @@ -527,6 +534,10 @@ ifneq "$(CXX)" "" endif endif +ifeq "$(GEN_GNU_BUILD_ID)" "YES" + LDFLAGS += -Wl,--build-id +endif + #---------------------------------------------------------------------- # DYLIB_ONLY variable can be used to skip the building of a.out. # See the sections below regarding dSYM file as well as the building of @@ -565,10 +576,17 @@ else endif else ifeq "$(SPLIT_DEBUG_SYMBOLS)" "YES" +ifeq "$(SAVE_FULL_DEBUG_BINARY)" "YES" + cp "$(EXE)" "$(EXE).unstripped" +endif $(OBJCOPY) --only-keep-debug "$(EXE)" "$(DSYM)" $(OBJCOPY) --strip-debug --add-gnu-debuglink="$(DSYM)" "$(EXE)" "$(EXE)" endif +ifeq "$(MAKE_DWP)" "YES" + $(DWP) -o "$(DWP_NAME)" $(DWOS) endif +endif + #---------------------------------------------------------------------- # Make the dylib @@ -610,9 +628,15 @@ endif else $(LD) $(DYLIB_OBJECTS) $(LDFLAGS) -shared -o "$(DYLIB_FILENAME)" ifeq "$(SPLIT_DEBUG_SYMBOLS)" "YES" + ifeq "$(SAVE_FULL_DEBUG_BINARY)" "YES" + cp "$(DYLIB_FILENAME)" "$(DYLIB_FILENAME).unstripped" + endif $(OBJCOPY) --only-keep-debug "$(DYLIB_FILENAME)" "$(DYLIB_FILENAME).debug" $(OBJCOPY) --strip-debug --add-gnu-debuglink="$(DYLIB_FILENAME).debug" "$(DYLIB_FILENAME)" "$(DYLIB_FILENAME)" endif +ifeq "$(MAKE_DWP)" "YES" + $(DWP) -o $(DYLIB_DWP_FILE) $(DYLIB_DWOS) +endif endif #---------------------------------------------------------------------- diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 
49f13d2c89e3..dafdf241f9db 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -4378,26 +4378,38 @@ const std::shared_ptr &SymbolFileDWARF::GetDwpSymbolFile() { FileSpecList search_paths = Target::GetDefaultDebugFileSearchPaths(); ModuleSpec module_spec; module_spec.GetFileSpec() = m_objfile_sp->GetFileSpec(); + FileSpec dwp_filespec; for (const auto &symfile : symfiles.files()) { module_spec.GetSymbolFileSpec() = FileSpec(symfile.GetPath() + ".dwp", symfile.GetPathStyle()); LLDB_LOG(log, "Searching for DWP using: \"{0}\"", module_spec.GetSymbolFileSpec()); - FileSpec dwp_filespec = + dwp_filespec = PluginManager::LocateExecutableSymbolFile(module_spec, search_paths); if (FileSystem::Instance().Exists(dwp_filespec)) { - LLDB_LOG(log, "Found DWP file: \"{0}\"", dwp_filespec); - DataBufferSP dwp_file_data_sp; - lldb::offset_t dwp_file_data_offset = 0; - ObjectFileSP dwp_obj_file = ObjectFile::FindPlugin( - GetObjectFile()->GetModule(), &dwp_filespec, 0, - FileSystem::Instance().GetByteSize(dwp_filespec), dwp_file_data_sp, - dwp_file_data_offset); - if (dwp_obj_file) { - m_dwp_symfile = std::make_shared( - *this, dwp_obj_file, DIERef::k_file_index_mask); - break; - } + break; + } + } + if (!FileSystem::Instance().Exists(dwp_filespec)) { + LLDB_LOG(log, "No DWP file found locally"); + // Fill in the UUID for the module we're trying to match for, so we can + // find the correct DWP file, as the Debuginfod plugin uses *only* this + // data to correctly match the DWP file with the binary. 
+ module_spec.GetUUID() = m_objfile_sp->GetUUID(); + dwp_filespec = + PluginManager::LocateExecutableSymbolFile(module_spec, search_paths); + } + if (FileSystem::Instance().Exists(dwp_filespec)) { + LLDB_LOG(log, "Found DWP file: \"{0}\"", dwp_filespec); + DataBufferSP dwp_file_data_sp; + lldb::offset_t dwp_file_data_offset = 0; + ObjectFileSP dwp_obj_file = ObjectFile::FindPlugin( + GetObjectFile()->GetModule(), &dwp_filespec, 0, + FileSystem::Instance().GetByteSize(dwp_filespec), dwp_file_data_sp, + dwp_file_data_offset); + if (dwp_obj_file) { + m_dwp_symfile = std::make_shared( + *this, dwp_obj_file, DIERef::k_file_index_mask); } } if (!m_dwp_symfile) { diff --git a/lldb/source/Plugins/SymbolLocator/CMakeLists.txt b/lldb/source/Plugins/SymbolLocator/CMakeLists.txt index ca969626f4ff..3367022639ab 100644 --- a/lldb/source/Plugins/SymbolLocator/CMakeLists.txt +++ b/lldb/source/Plugins/SymbolLocator/CMakeLists.txt @@ -1,5 +1,10 @@ +# Order matters here: the first symbol locator prevents further searching. +# For DWARF binaries that are both stripped and split, the Default plugin +# will return the stripped binary when asked for the ObjectFile, which then +# prevents an unstripped binary from being requested from the Debuginfod +# provider. +add_subdirectory(Debuginfod) add_subdirectory(Default) if (CMAKE_SYSTEM_NAME MATCHES "Darwin") add_subdirectory(DebugSymbols) endif() -add_subdirectory(Debuginfod) diff --git a/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp b/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp index b5fe35d71032..f296e655cc46 100644 --- a/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp +++ b/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp @@ -44,6 +44,24 @@ llvm::StringRef SymbolVendorELF::GetPluginDescriptionStatic() { "executables."; } +// If this is needed elsewhere, it can be exported/moved. 
+static bool IsDwpSymbolFile(const lldb::ModuleSP &module_sp, + const FileSpec &file_spec) { + DataBufferSP dwp_file_data_sp; + lldb::offset_t dwp_file_data_offset = 0; + // Try to create an ObjectFile from the file_spec. + ObjectFileSP dwp_obj_file = ObjectFile::FindPlugin( + module_sp, &file_spec, 0, FileSystem::Instance().GetByteSize(file_spec), + dwp_file_data_sp, dwp_file_data_offset); + // The presence of a debug_cu_index section is the key identifying feature of + // a DWP file. Make sure we don't fill in the section list on dwp_obj_file + // (by calling GetSectionList(false)) as this function could be called before + // we may have all the symbol files collected and available. + return dwp_obj_file && ObjectFileELF::classof(dwp_obj_file.get()) && + dwp_obj_file->GetSectionList(false)->FindSectionByType( + eSectionTypeDWARFDebugCuIndex, false); +} + // CreateInstance // // Platforms can register a callback to use when creating symbol vendors to @@ -87,8 +105,15 @@ SymbolVendorELF::CreateInstance(const lldb::ModuleSP &module_sp, FileSpecList search_paths = Target::GetDefaultDebugFileSearchPaths(); FileSpec dsym_fspec = PluginManager::LocateExecutableSymbolFile(module_spec, search_paths); - if (!dsym_fspec) - return nullptr; + if (!dsym_fspec || IsDwpSymbolFile(module_sp, dsym_fspec)) { + // If we have a stripped binary or if we got a DWP file, we should prefer + // symbols in the executable acquired through a plugin. 
+ ModuleSpec unstripped_spec = + PluginManager::LocateExecutableObjectFile(module_spec); + if (!unstripped_spec) + return nullptr; + dsym_fspec = unstripped_spec.GetFileSpec(); + } DataBufferSP dsym_file_data_sp; lldb::offset_t dsym_file_data_offset = 0; diff --git a/lldb/test/API/debuginfod/Normal/Makefile b/lldb/test/API/debuginfod/Normal/Makefile new file mode 100644 index 000000000000..54bd7adae241 --- /dev/null +++ b/lldb/test/API/debuginfod/Normal/Makefile @@ -0,0 +1,19 @@ +C_SOURCES := main.c + +# For normal (non DWP) Debuginfod tests, we need: + +# * The full binary: a.out.unstripped +# Produced by Makefile.rules with SAVE_FULL_DEBUG_BINARY set to YES and +# SPLIT_DEBUG_SYMBOLS set to YES + +# * The stripped binary (a.out) +# Produced by Makefile.rules with SPLIT_DEBUG_SYMBOLS set to YES + +# * The 'only-keep-debug' binary (a.out.debug) +# Produced below + +SPLIT_DEBUG_SYMBOLS := YES +SAVE_FULL_DEBUG_BINARY := YES +GEN_GNU_BUILD_ID := YES + +include Makefile.rules diff --git a/lldb/test/API/debuginfod/Normal/TestDebuginfod.py b/lldb/test/API/debuginfod/Normal/TestDebuginfod.py new file mode 100644 index 000000000000..f1be1e705ee0 --- /dev/null +++ b/lldb/test/API/debuginfod/Normal/TestDebuginfod.py @@ -0,0 +1,183 @@ +import os +import shutil +import tempfile + +import lldb +from lldbsuite.test.decorators import * +import lldbsuite.test.lldbutil as lldbutil +from lldbsuite.test.lldbtest import * + + +""" +Test support for the DebugInfoD network symbol acquisition protocol. +This one is for simple / no split-dwarf scenarios. + +For no-split-dwarf scenarios, there are 2 variations: +1 - A stripped binary with it's corresponding unstripped binary: +2 - A stripped binary with a corresponding --only-keep-debug symbols file +""" + + +# It looks like Linux-AArch64 doesn't support build-id's on the LLDB builtbots +class DebugInfodTests(TestBase): + # No need to try every flavor of debug inf. 
+ NO_DEBUG_INFO_TESTCASE = True + + @skipIf(oslist=no_match(["linux"]), archs=no_match(["i386", "x86_64"])) + def test_normal_no_symbols(self): + """ + Validate behavior with no symbols or symbol locator. + ('baseline negative' behavior) + """ + test_root = self.config_test(["a.out"]) + self.try_breakpoint(False) + + @skipIf(oslist=no_match(["linux"]), archs=no_match(["i386", "x86_64"])) + def test_normal_default(self): + """ + Validate behavior with symbols, but no symbol locator. + ('baseline positive' behavior) + """ + test_root = self.config_test(["a.out", "a.out.debug"]) + self.try_breakpoint(True) + + @skipIf(oslist=no_match(["linux"]), archs=no_match(["i386", "x86_64"])) + def test_debuginfod_symbols(self): + """ + Test behavior with the full binary available from Debuginfod as + 'debuginfo' from the plug-in. + """ + test_root = self.config_test(["a.out"], "a.out.unstripped") + self.try_breakpoint(True) + + @skipIf(oslist=no_match(["linux"]), archs=no_match(["i386", "x86_64"])) + def test_debuginfod_executable(self): + """ + Test behavior with the full binary available from Debuginfod as + 'executable' from the plug-in. + """ + test_root = self.config_test(["a.out"], None, "a.out.unstripped") + self.try_breakpoint(True) + + @skipIf(oslist=no_match(["linux"]), archs=no_match(["i386", "x86_64"])) + def test_debuginfod_okd_symbols(self): + """ + Test behavior with the 'only-keep-debug' symbols available from Debuginfod. + """ + test_root = self.config_test(["a.out"], "a.out.debug") + self.try_breakpoint(True) + + def try_breakpoint(self, should_have_loc): + """ + This function creates a target from self.aout, sets a function-name + breakpoint, and checks to see if we have a file/line location, + as a way to validate that the symbols have been loaded. + should_have_loc specifies if we're testing that symbols have or + haven't been loaded. 
+ """ + target = self.dbg.CreateTarget(self.aout) + self.assertTrue(target and target.IsValid(), "Target is valid") + + bp = target.BreakpointCreateByName("func") + self.assertTrue(bp and bp.IsValid(), "Breakpoint is valid") + self.assertEqual(bp.GetNumLocations(), 1) + + loc = bp.GetLocationAtIndex(0) + self.assertTrue(loc and loc.IsValid(), "Location is valid") + addr = loc.GetAddress() + self.assertTrue(addr and addr.IsValid(), "Loc address is valid") + line_entry = addr.GetLineEntry() + self.assertEqual( + should_have_loc, + line_entry != None and line_entry.IsValid(), + "Loc line entry is valid", + ) + if should_have_loc: + self.assertEqual(line_entry.GetLine(), 4) + self.assertEqual( + line_entry.GetFileSpec().GetFilename(), + self.main_source_file.GetFilename(), + ) + self.dbg.DeleteTarget(target) + shutil.rmtree(self.tmp_dir) + + def config_test(self, local_files, debuginfo=None, executable=None): + """ + Set up a test with local_files[] copied to a different location + so that we control which files are, or are not, found in the file system. 
+ Also, create a stand-alone file-system 'hosted' debuginfod server with the + provided debuginfo and executable files (if they exist) + + Make the filesystem look like: + + /tmp//test/[local_files] + + /tmp//cache (for lldb to use as a temp cache) + + /tmp//buildid//executable -> + /tmp//buildid//debuginfo -> + Returns the /tmp/ path + """ + + self.build() + + uuid = self.getUUID("a.out") + if not uuid: + self.fail("Could not get UUID for a.out") + return + self.main_source_file = lldb.SBFileSpec("main.c") + self.tmp_dir = tempfile.mkdtemp() + test_dir = os.path.join(self.tmp_dir, "test") + os.makedirs(test_dir) + + self.aout = "" + # Copy the files used by the test: + for f in local_files: + shutil.copy(self.getBuildArtifact(f), test_dir) + # The first item is the binary to be used for the test + if self.aout == "": + self.aout = os.path.join(test_dir, f) + + use_debuginfod = debuginfo != None or executable != None + + # Populated the 'file://... mocked' Debuginfod server: + if use_debuginfod: + os.makedirs(os.path.join(self.tmp_dir, "cache")) + uuid_dir = os.path.join(self.tmp_dir, "buildid", uuid) + os.makedirs(uuid_dir) + if debuginfo: + shutil.copy( + self.getBuildArtifact(debuginfo), + os.path.join(uuid_dir, "debuginfo"), + ) + if executable: + shutil.copy( + self.getBuildArtifact(executable), + os.path.join(uuid_dir, "executable"), + ) + + # Configure LLDB for the test: + self.runCmd( + "settings set symbols.enable-external-lookup %s" + % str(use_debuginfod).lower() + ) + self.runCmd("settings clear plugin.symbol-locator.debuginfod.server-urls") + if use_debuginfod: + self.runCmd( + "settings set plugin.symbol-locator.debuginfod.cache-path %s/cache" + % self.tmp_dir + ) + self.runCmd( + "settings insert-before plugin.symbol-locator.debuginfod.server-urls 0 file://%s" + % self.tmp_dir + ) + + def getUUID(self, filename): + try: + target = self.dbg.CreateTarget(self.getBuildArtifact(filename)) + module = target.GetModuleAtIndex(0) + uuid = 
module.GetUUIDString().replace("-", "").lower() + self.dbg.DeleteTarget(target) + return uuid if len(uuid) == 40 else None + except: + return None diff --git a/lldb/test/API/debuginfod/Normal/main.c b/lldb/test/API/debuginfod/Normal/main.c new file mode 100644 index 000000000000..4c7184609b45 --- /dev/null +++ b/lldb/test/API/debuginfod/Normal/main.c @@ -0,0 +1,7 @@ +// This is a dumb little pair of test files + +int func(int argc, const char *argv[]) { + return (argc + 1) * (argv[argc][0] + 2); +} + +int main(int argc, const char *argv[]) { return func(0, argv); } diff --git a/lldb/test/API/debuginfod/SplitDWARF/Makefile b/lldb/test/API/debuginfod/SplitDWARF/Makefile new file mode 100644 index 000000000000..3ab9a969e5a4 --- /dev/null +++ b/lldb/test/API/debuginfod/SplitDWARF/Makefile @@ -0,0 +1,23 @@ +C_SOURCES := main.c + +# For split-dwarf Debuginfod tests, we need: + +# * A .DWP file (a.out.dwp) +# Produced by Makefile.rules with MAKE_DWP set to YES + +# * The "full" binary (missing things that live in .dwo's) (a.out.unstripped) +# Produced by Makefile.rules with SAVE_FULL_DEBUG_BINARY set to YES and +# SPLIT_DEBUG_SYMBOLS set to YES + +# * The stripped binary (a.out) +# Produced by Makefile.rules + +# * The 'only-keep-debug' binary (a.out.debug) +# Produced below + +MAKE_DWP := YES +SPLIT_DEBUG_SYMBOLS := YES +SAVE_FULL_DEBUG_BINARY := YES +GEN_GNU_BUILD_ID := YES + +include Makefile.rules diff --git a/lldb/test/API/debuginfod/SplitDWARF/TestDebuginfodDWP.py b/lldb/test/API/debuginfod/SplitDWARF/TestDebuginfodDWP.py new file mode 100644 index 000000000000..fec2fa1132d7 --- /dev/null +++ b/lldb/test/API/debuginfod/SplitDWARF/TestDebuginfodDWP.py @@ -0,0 +1,193 @@ +""" +Test support for the DebugInfoD network symbol acquisition protocol. 
+""" +import os +import shutil +import tempfile + +import lldb +from lldbsuite.test.decorators import * +import lldbsuite.test.lldbutil as lldbutil +from lldbsuite.test.lldbtest import * + + +""" +Test support for the DebugInfoD network symbol acquisition protocol. +This file is for split-dwarf (dwp) scenarios. + +1 - A split binary target with it's corresponding DWP file +2 - A stripped, split binary target with an unstripped binary and a DWP file +3 - A stripped, split binary target with an --only-keep-debug symbols file and a DWP file +""" + + +# It looks like Linux-AArch64 doesn't support build-id's on the LLDB builtbots +class DebugInfodDWPTests(TestBase): + # No need to try every flavor of debug inf. + NO_DEBUG_INFO_TESTCASE = True + + @skipIf(oslist=no_match(["linux"]), archs=no_match(["i386", "x86_64"])) + def test_normal_stripped(self): + """ + Validate behavior with a stripped binary, no symbols or symbol locator. + """ + self.config_test(["a.out"]) + self.try_breakpoint(False) + + @skipIf(oslist=no_match(["linux"]), archs=no_match(["i386", "x86_64"])) + def test_normal_stripped_split_with_dwp(self): + """ + Validate behavior with symbols, but no symbol locator. + """ + self.config_test(["a.out", "a.out.debug", "a.out.dwp"]) + self.try_breakpoint(True) + + @skipIf(oslist=no_match(["linux"]), archs=no_match(["i386", "x86_64"])) + def test_normal_stripped_only_dwp(self): + """ + Validate behavior *with* dwp symbols only, but missing other symbols, + but no symbol locator. This shouldn't work: without the other symbols + DWO's appear mostly useless. + """ + self.config_test(["a.out", "a.out.dwp"]) + self.try_breakpoint(False) + + @skipIf(oslist=no_match(["linux"]), archs=no_match(["i386", "x86_64"])) + def test_debuginfod_dwp_from_service(self): + """ + Test behavior with the unstripped binary, and DWP from the service. 
+ """ + self.config_test(["a.out.debug"], "a.out.dwp") + self.try_breakpoint(True) + + @skipIf(oslist=no_match(["linux"]), archs=no_match(["i386", "x86_64"])) + def test_debuginfod_both_symfiles_from_service(self): + """ + Test behavior with a stripped binary, with the unstripped binary and + dwp symbols from Debuginfod. + """ + self.config_test(["a.out"], "a.out.dwp", "a.out.unstripped") + self.try_breakpoint(True) + + @skipIf(oslist=no_match(["linux"]), archs=no_match(["i386", "x86_64"])) + def test_debuginfod_both_okd_symfiles_from_service(self): + """ + Test behavior with both the only-keep-debug symbols and the dwp symbols + from Debuginfod. + """ + self.config_test(["a.out"], "a.out.dwp", "a.out.debug") + self.try_breakpoint(True) + + def try_breakpoint(self, should_have_loc): + """ + This function creates a target from self.aout, sets a function-name + breakpoint, and checks to see if we have a file/line location, + as a way to validate that the symbols have been loaded. + should_have_loc specifies if we're testing that symbols have or + haven't been loaded. 
+ """ + target = self.dbg.CreateTarget(self.aout) + self.assertTrue(target and target.IsValid(), "Target is valid") + + bp = target.BreakpointCreateByName("func") + self.assertTrue(bp and bp.IsValid(), "Breakpoint is valid") + self.assertEqual(bp.GetNumLocations(), 1) + + loc = bp.GetLocationAtIndex(0) + self.assertTrue(loc and loc.IsValid(), "Location is valid") + addr = loc.GetAddress() + self.assertTrue(addr and addr.IsValid(), "Loc address is valid") + line_entry = addr.GetLineEntry() + self.assertEqual( + should_have_loc, + line_entry != None and line_entry.IsValid(), + "Loc line entry is valid", + ) + if should_have_loc: + self.assertEqual(line_entry.GetLine(), 4) + self.assertEqual( + line_entry.GetFileSpec().GetFilename(), + self.main_source_file.GetFilename(), + ) + self.dbg.DeleteTarget(target) + shutil.rmtree(self.tmp_dir) + + def config_test(self, local_files, debuginfo=None, executable=None): + """ + Set up a test with local_files[] copied to a different location + so that we control which files are, or are not, found in the file system. + Also, create a stand-alone file-system 'hosted' debuginfod server with the + provided debuginfo and executable files (if they exist) + + Make the filesystem look like: + + /tmp//test/[local_files] + + /tmp//cache (for lldb to use as a temp cache) + + /tmp//buildid//executable -> + /tmp//buildid//debuginfo -> + Returns the /tmp/ path + """ + + self.build() + + uuid = self.getUUID("a.out") + if not uuid: + self.fail("Could not get UUID for a.out") + return + self.main_source_file = lldb.SBFileSpec("main.c") + self.tmp_dir = tempfile.mkdtemp() + self.test_dir = os.path.join(self.tmp_dir, "test") + os.makedirs(self.test_dir) + + self.aout = "" + # Copy the files used by the test: + for f in local_files: + shutil.copy(self.getBuildArtifact(f), self.test_dir) + if self.aout == "": + self.aout = os.path.join(self.test_dir, f) + + use_debuginfod = debuginfo != None or executable != None + + # Populated the 'file://... 
mocked' Debuginfod server: + if use_debuginfod: + os.makedirs(os.path.join(self.tmp_dir, "cache")) + uuid_dir = os.path.join(self.tmp_dir, "buildid", uuid) + os.makedirs(uuid_dir) + if debuginfo: + shutil.copy( + self.getBuildArtifact(debuginfo), + os.path.join(uuid_dir, "debuginfo"), + ) + if executable: + shutil.copy( + self.getBuildArtifact(executable), + os.path.join(uuid_dir, "executable"), + ) + os.remove(self.getBuildArtifact("main.dwo")) + # Configure LLDB for the test: + self.runCmd( + "settings set symbols.enable-external-lookup %s" + % str(use_debuginfod).lower() + ) + self.runCmd("settings clear plugin.symbol-locator.debuginfod.server-urls") + if use_debuginfod: + self.runCmd( + "settings set plugin.symbol-locator.debuginfod.cache-path %s/cache" + % self.tmp_dir + ) + self.runCmd( + "settings insert-before plugin.symbol-locator.debuginfod.server-urls 0 file://%s" + % self.tmp_dir + ) + + def getUUID(self, filename): + try: + target = self.dbg.CreateTarget(self.getBuildArtifact(filename)) + module = target.GetModuleAtIndex(0) + uuid = module.GetUUIDString().replace("-", "").lower() + self.dbg.DeleteTarget(target) + return uuid if len(uuid) == 40 else None + except: + return None diff --git a/lldb/test/API/debuginfod/SplitDWARF/main.c b/lldb/test/API/debuginfod/SplitDWARF/main.c new file mode 100644 index 000000000000..4c7184609b45 --- /dev/null +++ b/lldb/test/API/debuginfod/SplitDWARF/main.c @@ -0,0 +1,7 @@ +// This is a dump little pair of test files + +int func(int argc, const char *argv[]) { + return (argc + 1) * (argv[argc][0] + 2); +} + +int main(int argc, const char *argv[]) { return func(0, argv); } -- cgit v1.2.3 From b798c2af8b8888f8db2a9ec293066fb72d893f0c Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Thu, 4 Apr 2024 20:48:23 +0200 Subject: [libc++][CI] Updates to Clang 19. (#85301) Since we have released Clang 16 is no longer actively supported. However the FreeBSD runner is still using this, so some tests still guard against Clang 16. 
--- .github/workflows/libcxx-build-and-test.yaml | 28 +++++++++++----------- libcxx/docs/index.rst | 2 +- .../diagnose_invalid_memory_order.verify.cpp | 2 +- .../no_unique_address.compile.pass.cpp | 2 +- .../range.split/no_unique_address.compile.pass.cpp | 2 +- .../no_unique_address.compile.pass.cpp | 2 +- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 4a881ef5ff56..1e9367732e59 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -38,11 +38,11 @@ env: # LLVM POST-BRANCH bump version # LLVM POST-BRANCH add compiler test for ToT - 1, e.g. "Clang 17" # LLVM RELEASE bump remove compiler ToT - 3, e.g. "Clang 15" - LLVM_HEAD_VERSION: "18" # Used compiler, update POST-BRANCH. - LLVM_PREVIOUS_VERSION: "17" - LLVM_OLDEST_VERSION: "16" + LLVM_HEAD_VERSION: "19" # Used compiler, update POST-BRANCH. + LLVM_PREVIOUS_VERSION: "18" + LLVM_OLDEST_VERSION: "17" GCC_STABLE_VERSION: "13" - LLVM_SYMBOLIZER_PATH: "/usr/bin/llvm-symbolizer-18" + LLVM_SYMBOLIZER_PATH: "/usr/bin/llvm-symbolizer-19" CLANG_CRASH_DIAGNOSTICS_DIR: "crash_diagnostics" @@ -59,8 +59,8 @@ jobs: 'generic-cxx26', 'generic-modules' ] - cc: [ 'clang-18' ] - cxx: [ 'clang++-18' ] + cc: [ 'clang-19' ] + cxx: [ 'clang++-19' ] clang_tidy: [ 'ON' ] include: - config: 'generic-gcc' @@ -100,22 +100,22 @@ jobs: 'generic-cxx20', 'generic-cxx23' ] - cc: [ 'clang-18' ] - cxx: [ 'clang++-18' ] + cc: [ 'clang-19' ] + cxx: [ 'clang++-19' ] clang_tidy: [ 'ON' ] include: - config: 'generic-gcc-cxx11' cc: 'gcc-13' cxx: 'g++-13' clang_tidy: 'OFF' - - config: 'generic-cxx23' - cc: 'clang-16' - cxx: 'clang++-16' - clang_tidy: 'OFF' - config: 'generic-cxx23' cc: 'clang-17' cxx: 'clang++-17' clang_tidy: 'OFF' + - config: 'generic-cxx26' + cc: 'clang-18' + cxx: 'clang++-18' + clang_tidy: 'ON' steps: - uses: actions/checkout@v4 - name: ${{ matrix.config }} @@ -186,8 
+186,8 @@ jobs: - name: ${{ matrix.config }} run: libcxx/utils/ci/run-buildbot ${{ matrix.config }} env: - CC: clang-18 - CXX: clang++-18 + CC: clang-19 + CXX: clang++-19 ENABLE_CLANG_TIDY: "OFF" - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0 if: always() diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index db55c6f02a3d..743f99297d17 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -134,7 +134,7 @@ velocity, libc++ drops support for older compilers as newer ones are released. ============ =============== ========================== ===================== Compiler Versions Restrictions Support policy ============ =============== ========================== ===================== -Clang 16, 17, 18-git latest two stable releases per `LLVM's release page `_ and the development version +Clang 17, 18, 19-git latest two stable releases per `LLVM's release page `_ and the development version AppleClang 15 latest stable release per `Xcode's release page `_ Open XL 17.1 (AIX) latest stable release per `Open XL's documentation page `_ GCC 13 In C++11 or later only latest stable release per `GCC's release page `_ diff --git a/libcxx/test/libcxx/atomics/diagnose_invalid_memory_order.verify.cpp b/libcxx/test/libcxx/atomics/diagnose_invalid_memory_order.verify.cpp index defd43cf267a..2790916edaf6 100644 --- a/libcxx/test/libcxx/atomics/diagnose_invalid_memory_order.verify.cpp +++ b/libcxx/test/libcxx/atomics/diagnose_invalid_memory_order.verify.cpp @@ -9,7 +9,7 @@ // This test fails with Clang <18 because diagnose_if doesn't emit all of the // diagnostics when -fdelayed-template-parsing is enabled, like it is in MSVC // mode. 
-// XFAIL: msvc && (clang-16 || clang-17) +// XFAIL: msvc && clang-17 // REQUIRES: diagnose-if-support diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/no_unique_address.compile.pass.cpp index b411ce198e25..a0bfb7c4a246 100644 --- a/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/no_unique_address.compile.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/no_unique_address.compile.pass.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17 -// XFAIL: msvc && (clang-16 || clang-17) +// XFAIL: msvc && clang-17 // class lazy_split_view { // _LIBCPP_NO_UNIQUE_ADDRESS _View __base_ = _View(); diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.split/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.split/no_unique_address.compile.pass.cpp index 0d8bfbc0316d..694cf1fd0d0e 100644 --- a/libcxx/test/libcxx/ranges/range.adaptors/range.split/no_unique_address.compile.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.adaptors/range.split/no_unique_address.compile.pass.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17 -// XFAIL: msvc && (clang-16 || clang-17) +// XFAIL: msvc && clang-17 // class split_view { // _LIBCPP_NO_UNIQUE_ADDRESS _View __base_ = _View(); diff --git a/libcxx/test/libcxx/ranges/range.factories/range.istream.view/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.factories/range.istream.view/no_unique_address.compile.pass.cpp index 8359d267245f..a77c4e4d1bcd 100644 --- a/libcxx/test/libcxx/ranges/range.factories/range.istream.view/no_unique_address.compile.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.factories/range.istream.view/no_unique_address.compile.pass.cpp @@ -8,7 
+8,7 @@ // UNSUPPORTED: no-localization // UNSUPPORTED: c++03, c++11, c++14, c++17 -// XFAIL: msvc && (clang-16 || clang-17) +// XFAIL: msvc && clang-17 // Test the libc++ extension that the value stored in `std::ranges::istream_view` has been marked // as _LIBCPP_NO_UNIQUE_ADDRESS -- cgit v1.2.3 From eeaaf33fc296d52a28252ba0d6cfe187b7b3412f Mon Sep 17 00:00:00 2001 From: Jonathan Peyton Date: Thu, 4 Apr 2024 13:54:40 -0500 Subject: [OpenMP] Unsupport absolute KMP_HW_SUBSET test for s390x (#87555) --- openmp/runtime/test/affinity/kmp-abs-hw-subset.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/openmp/runtime/test/affinity/kmp-abs-hw-subset.c b/openmp/runtime/test/affinity/kmp-abs-hw-subset.c index 7b3493f1e5c4..025a239a1b83 100644 --- a/openmp/runtime/test/affinity/kmp-abs-hw-subset.c +++ b/openmp/runtime/test/affinity/kmp-abs-hw-subset.c @@ -6,6 +6,12 @@ // RUN: env OMP_PLACES=threads %libomp-run 3 1 // RUN: env OMP_PLACES=threads %libomp-run 3 2 // REQUIRES: linux +// +// The test requires topologies with sockets, cores, threads layers where +// the socket layer contains multiple threads. +// The s390x architecture does not produce this topology and seems to have +// one thread per socket. +// UNSUPPORTED: s390x-target-arch #include #include -- cgit v1.2.3 From 8a0bfe490592de3df28d82c5dd69956e43c20f1d Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 4 Apr 2024 12:02:06 -0700 Subject: [SLP]Fix PR87630: wrong result for externally used vector value. Need to check that the externally used value can be represented with the BitWidth before applying it, otherwise need to keep wider type. 
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 10 ++++++++++ .../X86/external-user-instruction-minbitwidth.ll | 12 ++++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 79d146aec0c5..bdd26acfd2f8 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -14141,6 +14141,16 @@ bool BoUpSLP::collectValuesToDemote( })) return FinalAnalysis(); + if (!all_of(I->users(), + [=](User *U) { + return getTreeEntry(U) || + (UserIgnoreList && UserIgnoreList->contains(U)) || + (U->getType()->isSized() && + DL->getTypeSizeInBits(U->getType()) <= BitWidth); + }) && + !IsPotentiallyTruncated(I, BitWidth)) + return false; + unsigned Start = 0; unsigned End = I->getNumOperands(); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll index 9d3d602d0743..84f7e219f506 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll @@ -14,17 +14,13 @@ define i8 @test() { ; CHECK-NEXT: [[CONV1:%.*]] = zext i16 [[TMP1]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[CONV]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i32> [[TMP3]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i16> [[TMP4]], +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i32> [[TMP3]], ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[CONV1]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = trunc <8 x i32> [[TMP7]] to <8 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = add <8 
x i16> [[TMP5]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP9]]) -; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP8]]) ; CHECK-NEXT: [[CONV4_30:%.*]] = trunc i32 [[TMP11]] to i8 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 -; CHECK-NEXT: [[TMP13:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7 ; CHECK-NEXT: [[XOR_31:%.*]] = and i32 [[TMP13]], -2 ; CHECK-NEXT: store i32 [[XOR_31]], ptr @d, align 4 ; CHECK-NEXT: ret i8 [[CONV4_30]] -- cgit v1.2.3 From 9a0ae081047d7088cdecfa86a8c90b721485e418 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 4 Apr 2024 12:21:50 -0700 Subject: [NFC][HWASAN] Simplify `selectiveInstrumentationShouldSkip` (#87670) --- .../Instrumentation/HWAddressSanitizer.cpp | 36 ++++++++++------------ 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index d0d349c891a3..88e84ed7be8e 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -317,7 +317,7 @@ private: }; bool selectiveInstrumentationShouldSkip(Function &F, - FunctionAnalysisManager &FAM); + FunctionAnalysisManager &FAM) const; void initializeModule(); void createHwasanCtorComdat(); @@ -1500,28 +1500,24 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, } bool HWAddressSanitizer::selectiveInstrumentationShouldSkip( - Function &F, FunctionAnalysisManager &FAM) { + Function &F, FunctionAnalysisManager &FAM) const { if (ClRandomSkipRate.getNumOccurrences()) { std::bernoulli_distribution D(ClRandomSkipRate); - if (D(*Rng)) - return true; - } else { - auto &MAMProxy = 
FAM.getResult(F); - ProfileSummaryInfo *PSI = - MAMProxy.getCachedResult(*F.getParent()); - if (PSI && PSI->hasProfileSummary()) { - auto &BFI = FAM.getResult(F); - if ((ClHotPercentileCutoff.getNumOccurrences() && - ClHotPercentileCutoff >= 0) - ? PSI->isFunctionHotInCallGraphNthPercentile( - ClHotPercentileCutoff, &F, BFI) - : PSI->isFunctionHotInCallGraph(&F, BFI)) - return true; - } else { - ++NumNoProfileSummaryFuncs; - } + return (D(*Rng)); } - return false; + auto &MAMProxy = FAM.getResult(F); + ProfileSummaryInfo *PSI = + MAMProxy.getCachedResult(*F.getParent()); + if (!PSI || !PSI->hasProfileSummary()) { + ++NumNoProfileSummaryFuncs; + return false; + } + auto &BFI = FAM.getResult(F); + return ( + (ClHotPercentileCutoff.getNumOccurrences() && ClHotPercentileCutoff >= 0) + ? PSI->isFunctionHotInCallGraphNthPercentile(ClHotPercentileCutoff, + &F, BFI) + : PSI->isFunctionHotInCallGraph(&F, BFI)); } void HWAddressSanitizer::sanitizeFunction(Function &F, -- cgit v1.2.3